In [1]:
# Basic packages
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import random as rd # generating random numbers
import datetime # manipulating date formats
# Viz
%matplotlib inline
import matplotlib.pyplot as plt # basic plotting
import seaborn as sns # for prettier plots
In [2]:
sales=pd.read_csv("Orders-Table 1.csv")

#formatting the date column correctly
sales.date=sales.OrderDate.apply(lambda x:datetime.datetime.strptime(x, '%m/%d/%y'))
# check
print(sales.info())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9994 entries, 0 to 9993
Data columns (total 21 columns):
Row ID           9994 non-null int64
Order ID         9994 non-null object
OrderDate        9994 non-null object
Ship Date        9994 non-null object
Ship Mode        9994 non-null object
Customer ID      9994 non-null object
Customer Name    9994 non-null object
Segment          9994 non-null object
Country          9994 non-null object
City             9994 non-null object
State            9994 non-null object
Postal Code      9994 non-null int64
Region           9994 non-null object
Product ID       9994 non-null object
Category         9994 non-null object
Sub-Category     9994 non-null object
Product Name     9994 non-null object
Sales            9994 non-null float64
Quantity         9994 non-null int64
Discount         9994 non-null float64
Profit           9994 non-null float64
dtypes: float64(3), int64(3), object(15)
memory usage: 1.6+ MB
None
/Users/vijaynandwani/anaconda3/envs/pallavi/lib/python3.6/site-packages/ipykernel_launcher.py:4: UserWarning: Pandas doesn't allow columns to be created via a new attribute name - see https://pandas.pydata.org/pandas-docs/stable/indexing.html#attribute-access
  after removing the cwd from sys.path.
In [3]:
sales['orderyear'] = sales.date.dt.year
sales['ordermonth'] = sales.date.dt.month
In [4]:
sales['monthyear'] = sales.apply(lambda row :
                          (row.orderyear,row.ordermonth), 
                                    axis=1)
newdf = sales[['monthyear', 'Sales', 'Profit']].copy()

newdf.groupby([newdf.monthyear]).sum().plot()
Out[4]:
<matplotlib.axes._subplots.AxesSubplot at 0x1109afbe0>
In [5]:
# matplotlibdf = pd.read_csv("Orders-Table 1.csv",parse_dates=['OrderDate'], index_col='OrderDate')
matplotlibdf = sales[['OrderDate', 'Sales', 'Profit']].copy()
matplotlibdf['OrderDate'] = pd.to_datetime(matplotlibdf['OrderDate'], format='%m/%d/%y')
#matplotlibdf.Sales.resample('M',how='sum')
#matplotlibdf
linechartData = matplotlibdf.groupby([pd.Grouper(freq='MS',key='OrderDate')]).sum()
linechartData.plot(figsize=(20,10))
Out[5]:
<matplotlib.axes._subplots.AxesSubplot at 0x10d79dd30>
In [6]:
# Plotting Rolling mean

linechartData.rolling(window=3).mean().plot()
Out[6]:
<matplotlib.axes._subplots.AxesSubplot at 0x1105fa2e8>
In [7]:
import statsmodels.api as sm
# multiplicative

res = sm.tsa.seasonal_decompose(linechartData.Sales,freq=12,model="multiplicative")
#plt.figure(figsize=(16,12))
fig = res.plot()
#fig.show()
/Users/vijaynandwani/anaconda3/envs/pallavi/lib/python3.6/site-packages/statsmodels/compat/pandas.py:56: FutureWarning: The pandas.core.datetools module is deprecated and will be removed in a future version. Please use the pandas.tseries module instead.
  from pandas.core import datetools
In [8]:
# TIME SERIES
from statsmodels.tsa.arima_model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX
from pandas.plotting import autocorrelation_plot
from statsmodels.tsa.stattools import adfuller, acf, pacf,arma_order_select_ic
import statsmodels.formula.api as smf
import statsmodels.tsa.api as smt
import statsmodels.api as sm
import scipy.stats as scs



# Stationarity tests
def test_stationarity(timeseries):
    #Perform Dickey-Fuller test:
    print('Results of Dickey-Fuller Test:')
    dftest = adfuller(timeseries, autolag='AIC')
    dfoutput = pd.Series(dftest[0:4], index=['Test Statistic','p-value','#Lags Used','Number of Observations Used'])
    for key,value in dftest[4].items():
        dfoutput['Critical Value (%s)'%key] = value
    print (dfoutput)
    
In [9]:
print("For Sales")
test_stationarity(linechartData.iloc[:,0].values)
print("\n\nFor Profit")
test_stationarity(linechartData.iloc[:,1].values)
For Sales
Results of Dickey-Fuller Test:
Test Statistic                 -4.493768
p-value                         0.000202
#Lags Used                      0.000000
Number of Observations Used    47.000000
Critical Value (1%)            -3.577848
Critical Value (5%)            -2.925338
Critical Value (10%)           -2.600774
dtype: float64


For Profit
Results of Dickey-Fuller Test:
Test Statistic                 -2.828475
p-value                         0.054320
#Lags Used                      3.000000
Number of Observations Used    44.000000
Critical Value (1%)            -3.588573
Critical Value (5%)            -2.929886
Critical Value (10%)           -2.603185
dtype: float64

Negative test statistic and p value less than 0.05 means values are stationary.

In [10]:
# to remove trend
from pandas import Series as Series
# create a differenced series
def difference(dataset, interval=1):
    diff = list()
    for i in range(interval, len(dataset)):
        value = dataset[i] - dataset[i - interval]
        diff.append(value)
    return Series(diff)

# invert differenced forecast
def inverse_difference(last_ob, value):
    return value + last_ob
In [11]:
plt.figure(figsize=(16,16))
plt.subplot(311)
plt.plot(linechartData.Profit)
plt.plot()

plt.subplot(312)
plt.title('After De-trend')
plt.xlabel('Time')
plt.ylabel('Sales')
new_ts=difference(linechartData.Profit)
plt.plot(new_ts)
plt.plot()

plt.subplot(313)
plt.title('After De-seasonalization')
plt.xlabel('Time')
plt.ylabel('Sales')
new_ts=difference(linechartData.Profit,12)       # assuming the seasonality is 12 months long
plt.plot(new_ts)
plt.plot()
Out[11]:
[]
In [12]:
test_stationarity(new_ts)
Results of Dickey-Fuller Test:
Test Statistic                -8.662415e+00
p-value                        4.782827e-14
#Lags Used                     0.000000e+00
Number of Observations Used    3.500000e+01
Critical Value (1%)           -3.632743e+00
Critical Value (5%)           -2.948510e+00
Critical Value (10%)          -2.613017e+00
dtype: float64
In [13]:
def tsplot(y, lags=None, figsize=(10, 8), style='bmh',title=''):
    if not isinstance(y, pd.Series):
        y = pd.Series(y)
    with plt.style.context(style):    
        fig = plt.figure(figsize=figsize)
        #mpl.rcParams['font.family'] = 'Ubuntu Mono'
        layout = (3, 2)
        ts_ax = plt.subplot2grid(layout, (0, 0), colspan=2)
        acf_ax = plt.subplot2grid(layout, (1, 0))
        pacf_ax = plt.subplot2grid(layout, (1, 1))
        qq_ax = plt.subplot2grid(layout, (2, 0))
        pp_ax = plt.subplot2grid(layout, (2, 1))
        
        y.plot(ax=ts_ax)
        ts_ax.set_title(title)
        smt.graphics.plot_acf(y, lags=lags, ax=acf_ax, alpha=0.5)
        smt.graphics.plot_pacf(y, lags=lags, ax=pacf_ax, alpha=0.5)
        sm.qqplot(y, line='s', ax=qq_ax)
        qq_ax.set_title('QQ Plot')        
        scs.probplot(y, sparams=(y.mean(), y.std()), plot=pp_ax)

        plt.tight_layout()
    return 
In [14]:
# Simulate an AR(1) process with alpha = 0.6
np.random.seed(1)
n_samples = int(1000)
a = 0.6
x = w = np.random.normal(size=n_samples)

for t in range(n_samples):
    x[t] = a*x[t-1] + w[t]
limit=12    
_ = tsplot(x, lags=limit,title="AR(1)process")
In [15]:
# Simulate an AR(2) process

n = int(1000)
alphas = np.array([.444, .333])
betas = np.array([0.])

# Python requires us to specify the zero-lag value which is 1
# Also note that the alphas for the AR model must be negated
# We also set the betas for the MA equal to 0 for an AR(p) model
# For more information see the examples at statsmodels.org
ar = np.r_[1, -alphas]
ma = np.r_[1, betas]

ar2 = smt.arma_generate_sample(ar=ar, ma=ma, nsample=n) 
_ = tsplot(ar2, lags=12,title="AR(2) process")
In [16]:
# Simulate an MA(1) process
n = int(1000)
# set the AR(p) alphas equal to 0
alphas = np.array([0.])
betas = np.array([0.8])
# add zero-lag and negate alphas
ar = np.r_[1, -alphas]
ma = np.r_[1, betas]
ma1 = smt.arma_generate_sample(ar=ar, ma=ma, nsample=n) 
limit=12
_ = tsplot(ma1, lags=limit,title="MA(1) process")
In [17]:
# Simulate MA(2) process with betas 0.6, 0.4
n = int(1000)
alphas = np.array([0.])
betas = np.array([0.6, 0.4])
ar = np.r_[1, -alphas]
ma = np.r_[1, betas]

ma3 = smt.arma_generate_sample(ar=ar, ma=ma, nsample=n)
_ = tsplot(ma3, lags=12,title="MA(2) process")
In [18]:
# Simulate an ARMA(2, 2) model with alphas=[0.5,-0.25] and betas=[0.5,-0.3]
max_lag = 12

n = int(5000) # lots of samples to help estimates
burn = int(n/10) # number of samples to discard before fit

alphas = np.array([0.8, -0.65])
betas = np.array([0.5, -0.7])
ar = np.r_[1, -alphas]
ma = np.r_[1, betas]

arma22 = smt.arma_generate_sample(ar=ar, ma=ma, nsample=n, burnin=burn)
_ = tsplot(arma22, lags=max_lag,title="ARMA(2,2) process")
In [19]:
best_aic = np.inf 
best_order = None
best_mdl = None

rng = range(5)
for i in rng:
    for j in rng:
        try:
            tmp_mdl = smt.ARMA(new_ts.values, order=(i, j)).fit(method='mle', trend='nc')
            tmp_aic = tmp_mdl.aic
            if tmp_aic < best_aic:
                best_aic = tmp_aic
                best_order = (i, j)
                best_mdl = tmp_mdl
        except: continue


print('aic: {:6.5f} | order: {}'.format(best_aic, best_order))
/Users/vijaynandwani/anaconda3/envs/pallavi/lib/python3.6/site-packages/statsmodels/tsa/kalmanf/kalmanfilter.py:646: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.
  if issubdtype(paramsdtype, float):
/Users/vijaynandwani/anaconda3/envs/pallavi/lib/python3.6/site-packages/statsmodels/tsa/kalmanf/kalmanfilter.py:650: FutureWarning: Conversion of the second argument of issubdtype from `complex` to `np.complexfloating` is deprecated. In future, it will be treated as `np.complex128 == np.dtype(complex).type`.
  elif issubdtype(paramsdtype, complex):
/Users/vijaynandwani/anaconda3/envs/pallavi/lib/python3.6/site-packages/statsmodels/base/model.py:473: HessianInversionWarning: Inverting hessian failed, no bse or cov_params available
  'available', HessianInversionWarning)
/Users/vijaynandwani/anaconda3/envs/pallavi/lib/python3.6/site-packages/statsmodels/base/model.py:496: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
  "Check mle_retvals", ConvergenceWarning)
/Users/vijaynandwani/anaconda3/envs/pallavi/lib/python3.6/site-packages/statsmodels/base/model.py:496: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
  "Check mle_retvals", ConvergenceWarning)
aic: 720.66783 | order: (1, 0)
In [20]:
best_mdl.predict()
/Users/vijaynandwani/anaconda3/envs/pallavi/lib/python3.6/site-packages/statsmodels/tsa/kalmanf/kalmanfilter.py:577: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.
  if issubdtype(paramsdtype, float):
Out[20]:
array([    0.        ,  1778.53507676,  -605.61278704, -2865.34674712,
        -216.81211508,  -598.66677807,   509.23347576, -1281.68370065,
         -11.70031517,    36.9090553 ,   195.78146932,  -987.66006889,
         299.96158034, -1894.79301356,  -679.83832361,  1899.23050877,
         375.39459226, -1239.52493599,  -439.05457981,  -355.08328032,
        1022.13024741,  -347.40747958, -4166.35678753,  2626.40037678,
       -3062.39564663, -1339.24433947,  1052.22198561, -3456.99899643,
         634.46760002,   719.81903981, -1077.7462644 ,  -781.9398454 ,
       -2165.72432306,  -516.03915567,  2162.30472554, -1762.24253824])
In [21]:
matplotlibdf = sales[['OrderDate', 'Sales']].copy()
matplotlibdf['OrderDate'] = pd.to_datetime(matplotlibdf['OrderDate'], format='%m/%d/%y')
linechartData = matplotlibdf.groupby([pd.Grouper(freq='MS',key='OrderDate')]).sum()
linechartData = linechartData.reset_index()


from fbprophet import Prophet
#prophet reqiures a pandas df at the below config 
# ( date column named as DS and the value column as Y)
linechartData.columns=['ds','y']
model = Prophet( yearly_seasonality=True) #instantiate Prophet with only yearly seasonality as our data is monthly 
model.fit(linechartData) #fit the model with your dataframe
INFO:fbprophet.forecaster:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:fbprophet.forecaster:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
/Users/vijaynandwani/anaconda3/envs/pallavi/lib/python3.6/site-packages/pystan/misc.py:399: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.
  elif np.issubdtype(np.asarray(v).dtype, float):
Out[21]:
<fbprophet.forecaster.Prophet at 0x10d79d518>
In [22]:
# predict for five months in the furure and MS - month start is the frequency
future = model.make_future_dataframe(periods = 5, freq = 'MS')  
# now lets make the forecasts
forecast = model.predict(future)
forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']].tail()
Out[22]:
ds yhat yhat_lower yhat_upper
48 2018-01-01 43323.456758 33590.528042 53421.294343
49 2018-02-01 33730.695324 24209.318809 43470.079569
50 2018-03-01 70788.907869 61229.822884 80527.948305
51 2018-04-01 54148.980838 43979.592412 63559.977517
52 2018-05-01 57739.010458 47370.724152 67603.601537
In [23]:
model.plot(forecast)
Out[23]:
In [24]:
model.plot_components(forecast)
Out[24]:
In [25]:
# let's try to make weekly predictions

matplotlibdf = sales[['OrderDate', 'Sales']].copy()
matplotlibdf['OrderDate'] = pd.to_datetime(matplotlibdf['OrderDate'], format='%m/%d/%y')
#matplotlibdf.reset_index().set_index('OrderDate').resample('D', how='sum')
linechartData = matplotlibdf.groupby([pd.Grouper(freq='D',key='OrderDate')]).sum().reset_index().set_index('OrderDate')
#linechartData.reset_index()
#linechartData.set_index('OrderDate')
#linechartData = linechartData.ffill().reset_index()
#linechartData
#linechartData.sort_values(by=['OrderDate'], ascending=[True])

idx = pd.date_range('01-03-2014', '12-30-2017')
linechartData = linechartData.reindex(idx, fill_value=0)
linechartData = linechartData.reset_index()

from fbprophet import Prophet
#prophet reqiures a pandas df at the below config 
# ( date column named as DS and the value column as Y)
linechartData.columns=['ds','y']
model = Prophet( yearly_seasonality=True) #instantiate Prophet with only yearly seasonality as our data is monthly 
model.fit(linechartData) #fit the model with your dataframe

future = model.make_future_dataframe(periods = 30, freq = 'D')  
# now lets make the forecasts
forecast = model.predict(future)
forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']].tail()
model.plot(forecast)
INFO:fbprophet.forecaster:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
/Users/vijaynandwani/anaconda3/envs/pallavi/lib/python3.6/site-packages/pystan/misc.py:399: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.
  elif np.issubdtype(np.asarray(v).dtype, float):
Out[25]:
In [26]:
def me(df):
    return((df['yhat'] - df['y']).sum()/len(df['yhat']))
def mse(df):
    return((df['yhat'] - df['y']).pow(2).sum()/len(df))
def rmse(df):
    return(np.sqrt((df['yhat'] - df['y']).pow(2).sum()/len(df)))
def mae(df):
    return((df['yhat'] - df['y']).abs().sum()/len(df))
def mpe(df):
    return((df['yhat'] - df['y']).div(df['y']).sum()*(1/len(df)))
def mape(df):
    return((df['yhat'] - df['y']).div(df['y']).abs().sum()*(1/len(df)))

def all_metrics(model, df_cv = None):
    """Compute model fit metrics for time series.
    Computes the following metrics about each time series that has been through 
    Cross Validation;
    Mean Error (ME)
    Mean Squared Error (MSE)
    Root Mean Square Error (RMSE,
    Mean Absolute Error (MAE)
    Mean Percentage Error (MPE)
    Mean Absolute Percentage Error (MAPE)
    Parameters
    ----------
    df: A pandas dataframe. Contains y and yhat produced by cross-validation
    Returns
    -------
    A dictionary where the key = the error type, and value is the value of the error
    """

    

    df = []

    if df_cv is not None:
        df = df_cv
    else:
        # run a forecast on your own data with period = 0 so that it is in-sample data onlyl
        #df = model.predict(model.make_future_dataframe(periods=0))[['y', 'yhat']]
        df = (model
                .history[['ds', 'y']]
                .merge(
                    model.predict(model.make_future_dataframe(periods=0))[['ds', 'yhat']], 
                    how='inner', on='ds'
                    )
                )

    if 'yhat' not in df.columns:
        raise ValueError(
            'Please run Cross-Validation first before computing quality metrics.')

    return {
            'ME':me(df),
            'MSE':mse(df), 
            'RMSE': rmse(df), 
            'MAE': mae(df), 
            'MPE': mpe(df), 
            'MAPE': mape(df)
            }
In [27]:
matplotlibdf = sales[['OrderDate', 'Customer ID' , 'Sales']].copy()
matplotlibdf['OrderDate'] = pd.to_datetime(matplotlibdf['OrderDate'], format='%m/%d/%y')
f = {'Sales':['sum', 'mean', 'count'], 'Customer ID':['nunique']}
linechartData = matplotlibdf.groupby([pd.Grouper(freq='D',key='OrderDate')]).agg(f)
linechartData['AvgSalePerCustomer'] = linechartData['Sales']['sum']/linechartData['Customer ID']['nunique']
print(linechartData)
                Sales                   Customer ID AvgSalePerCustomer
                  sum        mean count     nunique                   
OrderDate                                                             
2014-01-03    16.4480   16.448000     1           1          16.448000
2014-01-04   288.0600   96.020000     3           1         288.060000
2014-01-05    19.5360   19.536000     1           1          19.536000
2014-01-06  4407.1000  489.677778     9           3        1469.033333
2014-01-07    87.1580   43.579000     2           1          87.158000
2014-01-08     0.0000         NaN     0           0                NaN
2014-01-09    40.5440   20.272000     2           1          40.544000
2014-01-10    54.8300   27.415000     2           1          54.830000
2014-01-11     9.9400    9.940000     1           1           9.940000
2014-01-12     0.0000         NaN     0           0                NaN
2014-01-13  3553.7950  323.072273    11           4         888.448750
2014-01-14    61.9600   61.960000     1           1          61.960000
2014-01-15   149.9500  149.950000     1           1         149.950000
2014-01-16   299.9640   74.991000     4           1         299.964000
2014-01-17     0.0000         NaN     0           0                NaN
2014-01-18    64.8640   64.864000     1           1          64.864000
2014-01-19   378.5940   94.648500     4           1         378.594000
2014-01-20  2673.8700  157.286471    17           4         668.467500
2014-01-21    25.2480   25.248000     1           1          25.248000
2014-01-22     0.0000         NaN     0           0                NaN
2014-01-23    46.0200   23.010000     2           2          23.010000
2014-01-24     0.0000         NaN     0           0                NaN
2014-01-25     0.0000         NaN     0           0                NaN
2014-01-26  1097.2500  121.916667     9           2         548.625000
2014-01-27   426.6700  142.223333     3           1         426.670000
2014-01-28     3.9280    3.928000     1           1           3.928000
2014-01-29     0.0000         NaN     0           0                NaN
2014-01-30   240.5000  120.250000     2           1         240.500000
2014-01-31   290.6660  290.666000     1           1         290.666000
2014-02-01   468.9000  468.900000     1           1         468.900000
...               ...         ...   ...         ...                ...
2017-12-01  5331.1780  156.799353    34          14         380.798429
2017-12-02  9951.1820  292.681824    34          16         621.948875
2017-12-03  1403.8420   70.192100    20           8         175.480250
2017-12-04  2639.6380  146.646556    18          10         263.963800
2017-12-05  1453.1360   76.480842    19           8         181.642000
2017-12-06    10.6800   10.680000     1           1          10.680000
2017-12-07  2916.5140  324.057111     9           7         416.644857
2017-12-08  7643.0410  254.768033    30           9         849.226778
2017-12-09  5470.3900  165.769394    33          15         364.692667
2017-12-10  3873.5590  184.455190    21          11         352.141727
2017-12-11  2823.9650  128.362045    22          11         256.724091
2017-12-12     0.0000         NaN     0           0                NaN
2017-12-13   580.9360   96.822667     6           3         193.645333
2017-12-14  3897.7140  216.539667    18           6         649.619000
2017-12-15   306.8880   61.377600     5           3         102.296000
2017-12-16   858.7020   66.054000    13           5         171.740400
2017-12-17  2027.7580  155.981385    13           8         253.469750
2017-12-18  3645.9110  182.295550    20           9         405.101222
2017-12-19  1895.9260  379.185200     5           3         631.975333
2017-12-20   377.7360   75.547200     5           2         188.868000
2017-12-21  2140.9400  194.630909    11           8         267.617500
2017-12-22  7442.0210  275.630407    27          11         676.547364
2017-12-23  1926.7760  128.451733    15           7         275.253714
2017-12-24  6233.0540  389.565875    16          11         566.641273
2017-12-25  2698.9270  117.344652    23          10         269.892700
2017-12-26   814.5940  203.648500     4           4         203.648500
2017-12-27   177.6360   88.818000     2           1         177.636000
2017-12-28  1657.3508   87.228989    19          10         165.735080
2017-12-29  2915.5340  242.961167    12           6         485.922333
2017-12-30   713.7900  101.970000     7           4         178.447500

[1458 rows x 5 columns]
In [28]:
matplotlibdf = sales[['OrderDate', 'Customer ID' , 'Sales', 'City']].copy()
matplotlibdf['OrderDate'] = pd.to_datetime(matplotlibdf['OrderDate'], format='%m/%d/%y')
f = {'Sales':['sum'], 'Customer ID':['nunique']}
linechartData = matplotlibdf.groupby('City').agg(f)
linechartData['AvgSalePerCustomer'] = linechartData['Sales']['sum']/linechartData['Customer ID']['nunique']
print(linechartData.sort_values('AvgSalePerCustomer', ascending=False))
                        Sales Customer ID AvgSalePerCustomer
                          sum     nunique                   
City                                                        
Burbank             3247.1580           1        3247.158000
Beverly             2583.1900           1        2583.190000
Jamestown           4708.7900           2        2354.395000
San Gabriel         2061.0100           1        2061.010000
Bellingham          3790.2400           2        1895.120000
Round Rock          4854.0528           3        1618.017600
Cheyenne            1603.1360           1        1603.136000
Torrance            3132.2680           2        1566.134000
Passaic             1562.0860           1        1562.086000
Burlington         21668.0820          14        1547.720143
Noblesville         3091.1800           2        1545.590000
Great Falls         4605.5800           3        1535.193333
Lafayette          25036.2000          17        1472.717647
Eugene              1465.0320           1        1465.032000
North Las Vegas     9801.0020           7        1400.143143
Mobile              5462.9900           4        1365.747500
Madison             5346.7900           4        1336.697500
Saint Cloud         1331.8200           1        1331.820000
Carol Stream        1305.8100           1        1305.810000
Kenosha             3906.7300           3        1302.243333
Minneapolis        16870.5400          13        1297.733846
Buffalo             9063.4960           7        1294.785143
Richardson          1288.4640           1        1288.464000
Yonkers             7657.6660           6        1276.277667
Renton              1242.6320           1        1242.632000
South Bend          1238.3300           1        1238.330000
Providence         15980.6500          13        1229.280769
Independence        2417.3700           2        1208.685000
Twin Falls          1148.8060           1        1148.806000
Rio Rancho          1114.1920           1        1114.192000
...                       ...         ...                ...
Nashua                35.9400           2          17.970000
Royal Oak             35.3400           2          17.670000
Norfolk               17.4300           1          17.430000
Mesquite              52.1480           3          17.382667
Portage               16.2800           1          16.280000
Grand Island          15.9600           1          15.960000
Margate               15.5520           1          15.552000
New Brunswick         14.7700           1          14.770000
Arlington Heights     14.1120           1          14.112000
Chapel Hill           14.0160           1          14.016000
Rock Hill             11.8500           1          11.850000
North Miami           22.1280           2          11.064000
Baytown               10.3680           1          10.368000
Iowa City              9.9900           1           9.990000
Cuyahoga Falls        29.0940           3           9.698000
Romeoville             8.9520           1           8.952000
Billings               8.2880           1           8.288000
Port Orange            7.8240           1           7.824000
Loveland              20.9640           3           6.988000
Deer Park              6.9240           1           6.924000
Missouri City          6.3700           1           6.370000
Keller                 6.0000           1           6.000000
Layton                 4.9600           1           4.960000
Springdale             4.3000           1           4.300000
San Luis Obispo        3.6200           1           3.620000
Ormond Beach           2.8080           1           2.808000
Pensacola              2.2140           1           2.214000
Jupiter                2.0640           1           2.064000
Elyria                 1.8240           1           1.824000
Abilene                1.3920           1           1.392000

[531 rows x 3 columns]
In [29]:
linechartData['AvgSalePerCustomer'].describe()
Out[29]:
count     531.000000
mean      395.867213
std       393.339737
min         1.392000
25%       121.184000
50%       295.526000
75%       533.663500
max      3247.158000
Name: AvgSalePerCustomer, dtype: float64
In [30]:
# ECDF: empirical cumulative distribution function

from statsmodels.distributions.empirical_distribution import ECDF

sns.set(style = "ticks")# to format into seaborn 
c = '#386B7F' # basic color for plots
plt.figure(figsize = (12, 6))
plt.subplots_adjust(hspace=.5)

plt.subplot(311)
cdf = ECDF(linechartData['Sales']['sum'])
plt.plot(cdf.x, cdf.y, label = "statmodels", color = c);
plt.xlabel('Sales'); plt.ylabel('ECDF');



# plot second ECDF  
plt.subplot(312)
cdf = ECDF(linechartData['Customer ID']['nunique'])
plt.plot(cdf.x, cdf.y, label = "statmodels", color = c);
plt.xlabel('Customers');

# plot second ECDF  
plt.subplot(313)
cdf = ECDF(linechartData['AvgSalePerCustomer'])
plt.plot(cdf.x, cdf.y, label = "statmodels", color = c);
plt.xlabel('Sale per Customer');
In [31]:
matplotlibdf = sales[['OrderDate', 'Sub-Category' , 'Sales', 'Customer ID', 'Profit']].copy()
matplotlibdf['OrderDate'] = pd.to_datetime(matplotlibdf['OrderDate'], format='%m/%d/%y')
matplotlibdf.groupby('Sub-Category')['Sales'].describe()
Out[31]:
count mean std min 25% 50% 75% max
Sub-Category
Accessories 775.0 215.974604 334.965015 0.990 47.9760 100.00000 239.9600 3347.370
Appliances 466.0 230.755710 388.949643 0.444 35.0550 82.69000 241.4400 2625.120
Art 796.0 34.068834 60.122465 1.344 7.9800 15.50400 36.4400 1113.024
Binders 1523.0 133.560560 563.251188 0.556 8.1170 18.56000 51.8485 9892.740
Bookcases 228.0 503.859633 638.748523 35.490 190.5000 306.82025 522.2850 4404.900
Chairs 617.0 532.332420 550.148243 26.640 191.9600 362.13600 662.8800 4416.174
Copiers 68.0 2198.941618 3175.665867 299.990 599.9875 1099.98000 2399.9600 17499.950
Envelopes 254.0 64.867724 84.351633 1.632 15.2575 29.04600 71.9475 604.656
Fasteners 217.0 13.936774 12.416593 1.240 5.6800 10.58400 17.9000 93.360
Furnishings 957.0 95.825668 147.893640 1.892 19.3000 41.96000 106.6800 1336.440
Labels 364.0 34.303055 74.119287 2.088 8.2600 14.94000 29.2400 786.480
Machines 115.0 1645.553313 2765.102088 11.560 287.9390 599.98500 2120.9380 22638.480
Paper 1370.0 57.284092 78.167639 3.380 14.5120 26.72000 61.9600 733.950
Phones 889.0 371.211534 491.457343 2.970 84.7840 209.97000 475.9440 4548.810
Storage 846.0 264.590553 355.222507 4.464 46.5300 113.92800 340.2000 2934.330
Supplies 190.0 245.650200 923.828753 1.744 12.1245 27.93000 55.4920 8187.650
Tables 319.0 648.794771 615.774655 24.368 244.0060 447.84000 872.1700 4297.644
In [32]:
f = {'Sales':['sum'], 'Customer ID':['nunique'], 'Profit':['sum']}
matplotlibdf.groupby('Sub-Category').agg(f)
Out[32]:
Sales Customer ID Profit
sum nunique sum
Sub-Category
Accessories 167380.3180 474 41936.6357
Appliances 107532.1610 356 18138.0054
Art 27118.7920 494 6527.7870
Binders 203412.7330 650 30221.7633
Bookcases 114879.9963 195 -3472.5560
Chairs 328449.1030 407 26590.1663
Copiers 149528.0300 64 55617.8249
Envelopes 16476.4020 206 6964.1767
Fasteners 3024.2800 191 949.5182
Furnishings 91705.1640 528 13059.1436
Labels 12486.3120 281 5546.2540
Machines 189238.6310 99 3384.7569
Paper 78479.2060 611 34053.5693
Phones 330007.0540 511 44515.7306
Storage 223843.6080 514 21278.8264
Supplies 46673.5380 160 -1189.0995
Tables 206965.5320 261 -17725.4811
In [33]:
def f(row):
    if float(row['Discount']) > 0:
        return 1
    else:
        return 0

matplotlibdf = sales[['OrderDate', 'Category' , 'Sales', 'Discount', 'Region', 'Customer ID']].copy()
matplotlibdf['OrderDate'] = pd.to_datetime(matplotlibdf['OrderDate'], format='%m/%d/%y')
matplotlibdf['is_discount'] = matplotlibdf.apply(f, axis=1)
matplotlibdf['month'] = matplotlibdf['OrderDate'].dt.month
matplotlibdf['weekday'] = matplotlibdf['OrderDate'].dt.weekday
print(matplotlibdf)
      OrderDate         Category      Sales  Discount   Region Customer ID  \
0    2016-11-08        Furniture   261.9600      0.00    South    CG-12520   
1    2016-11-08        Furniture   731.9400      0.00    South    CG-12520   
2    2016-06-12  Office Supplies    14.6200      0.00     West    DV-13045   
3    2015-10-11        Furniture   957.5775      0.45    South    SO-20335   
4    2015-10-11  Office Supplies    22.3680      0.20    South    SO-20335   
5    2014-06-09        Furniture    48.8600      0.00     West    BH-11710   
6    2014-06-09  Office Supplies     7.2800      0.00     West    BH-11710   
7    2014-06-09       Technology   907.1520      0.20     West    BH-11710   
8    2014-06-09  Office Supplies    18.5040      0.20     West    BH-11710   
9    2014-06-09  Office Supplies   114.9000      0.00     West    BH-11710   
10   2014-06-09        Furniture  1706.1840      0.20     West    BH-11710   
11   2014-06-09       Technology   911.4240      0.20     West    BH-11710   
12   2017-04-15  Office Supplies    15.5520      0.20    South    AA-10480   
13   2016-12-05  Office Supplies   407.9760      0.20     West    IM-15070   
14   2015-11-22  Office Supplies    68.8100      0.80  Central    HP-14815   
15   2015-11-22  Office Supplies     2.5440      0.80  Central    HP-14815   
16   2014-11-11  Office Supplies   665.8800      0.00  Central    PK-19075   
17   2014-05-13  Office Supplies    55.5000      0.00     West    AG-10270   
18   2014-08-27  Office Supplies     8.5600      0.00     West    ZD-21925   
19   2014-08-27       Technology   213.4800      0.20     West    ZD-21925   
20   2014-08-27  Office Supplies    22.7200      0.20     West    ZD-21925   
21   2016-12-09  Office Supplies    19.4600      0.00  Central    KB-16585   
22   2016-12-09  Office Supplies    60.3400      0.00  Central    KB-16585   
23   2017-07-16        Furniture    71.3720      0.30     East    SF-20065   
24   2015-09-25        Furniture  1044.6300      0.00     West    EB-13870   
25   2016-01-16  Office Supplies    11.6480      0.20     West    EH-13945   
26   2016-01-16       Technology    90.5700      0.00     West    EH-13945   
27   2015-09-17        Furniture  3083.4300      0.50     East    TB-21520   
28   2015-09-17  Office Supplies     9.6180      0.70     East    TB-21520   
29   2015-09-17        Furniture   124.2000      0.20     East    TB-21520   
...         ...              ...        ...       ...      ...         ...   
9964 2016-12-05        Furniture    13.4000      0.00     East    HE-14800   
9965 2016-12-05  Office Supplies     4.9800      0.00     East    HE-14800   
9966 2016-12-05  Office Supplies   109.6900      0.00     East    HE-14800   
9967 2017-12-11  Office Supplies    40.2000      0.00     East    RB-19435   
9968 2017-12-11  Office Supplies   735.9800      0.00     East    RB-19435   
9969 2017-12-11  Office Supplies    22.7500      0.00     East    RB-19435   
9970 2015-06-28  Office Supplies   119.5600      0.00    South    MP-17470   
9971 2015-06-28  Office Supplies   140.7500      0.00    South    MP-17470   
9972 2016-09-11  Office Supplies    99.5680      0.20  Central    RC-19960   
9973 2016-12-06       Technology   271.9600      0.20     West    AP-10720   
9974 2016-12-06  Office Supplies    18.6900      0.00     West    AP-10720   
9975 2016-12-06  Office Supplies    13.3600      0.00     West    AP-10720   
9976 2016-12-06       Technology   249.5840      0.20     West    AP-10720   
9977 2016-12-06  Office Supplies    13.8600      0.00     West    AP-10720   
9978 2016-12-06  Office Supplies    13.3760      0.20     West    AP-10720   
9979 2016-12-06  Office Supplies   437.4720      0.20     West    AP-10720   
9980 2015-09-06        Furniture    85.9800      0.00    South    SW-20455   
9981 2017-08-03  Office Supplies    16.5200      0.20     East    TB-21055   
9982 2016-09-22  Office Supplies    35.5600      0.00  Central    RC-19960   
9983 2016-09-22       Technology    97.9800      0.00  Central    RC-19960   
9984 2015-05-17  Office Supplies    31.5000      0.00     East    DV-13465   
9985 2015-05-17  Office Supplies    55.6000      0.00     East    DV-13465   
9986 2016-09-29       Technology    36.2400      0.00     West    ML-17410   
9987 2017-11-17       Technology    79.9900      0.00    South    RA-19885   
9988 2017-11-17       Technology   206.1000      0.00    South    RA-19885   
9989 2014-01-21        Furniture    25.2480      0.20    South    TB-21400   
9990 2017-02-26        Furniture    91.9600      0.00     West    DB-13060   
9991 2017-02-26       Technology   258.5760      0.20     West    DB-13060   
9992 2017-02-26  Office Supplies    29.6000      0.00     West    DB-13060   
9993 2017-05-04  Office Supplies   243.1600      0.00     West    CC-12220   

      is_discount  month  weekday  
0               0     11        1  
1               0     11        1  
2               0      6        6  
3               1     10        6  
4               1     10        6  
5               0      6        0  
6               0      6        0  
7               1      6        0  
8               1      6        0  
9               0      6        0  
10              1      6        0  
11              1      6        0  
12              1      4        5  
13              1     12        0  
14              1     11        6  
15              1     11        6  
16              0     11        1  
17              0      5        1  
18              0      8        2  
19              1      8        2  
20              1      8        2  
21              0     12        4  
22              0     12        4  
23              1      7        6  
24              0      9        4  
25              1      1        5  
26              0      1        5  
27              1      9        3  
28              1      9        3  
29              1      9        3  
...           ...    ...      ...  
9964            0     12        0  
9965            0     12        0  
9966            0     12        0  
9967            0     12        0  
9968            0     12        0  
9969            0     12        0  
9970            0      6        6  
9971            0      6        6  
9972            1      9        6  
9973            1     12        1  
9974            0     12        1  
9975            0     12        1  
9976            1     12        1  
9977            0     12        1  
9978            1     12        1  
9979            1     12        1  
9980            0      9        6  
9981            1      8        3  
9982            0      9        3  
9983            0      9        3  
9984            0      5        6  
9985            0      5        6  
9986            0      9        3  
9987            0     11        4  
9988            0     11        4  
9989            1      1        1  
9990            0      2        6  
9991            1      2        6  
9992            0      2        6  
9993            0      5        3  

[9994 rows x 9 columns]
In [34]:
sns.factorplot(data = matplotlibdf, x = 'month', y = "Sales", 
               col = 'Region', # per store type in cols
               palette = 'plasma',
               hue = 'Region',
               row = 'is_discount', # per promo in the store in rows
               color = c,
               estimator=sum) 

# Show point estimates and confidence intervals using scatter plot glyphs.
# A point plot represents an estimate of central tendency for a numeric variable by the 
# position of scatter plot points and provides some indication of the uncertainty around that 
# bestimate using error bars.
Out[34]:
<seaborn.axisgrid.FacetGrid at 0x115629208>
In [35]:
sns.factorplot(data = matplotlibdf, x = 'month', y = "Sales", 
               col = 'weekday', # per store type in cols
               palette = 'plasma',
               hue = 'Region',
               row = 'Region', # per store type in rows
               color = c,
               estimator=sum) 
Out[35]:
<seaborn.axisgrid.FacetGrid at 0x11036d320>
In [36]:
def f(row):
    if float(row['Discount']) > 0:
        return 1
    else:
        return 0

matplotlibdf = sales[['OrderDate', 'Segment' , 'Sales', 'Discount', 'Customer ID']].copy()
matplotlibdf['OrderDate'] = pd.to_datetime(matplotlibdf['OrderDate'], format='%m/%d/%y')
matplotlibdf['is_discount'] = matplotlibdf.apply(f, axis=1)
matplotlibdf['month'] = matplotlibdf['OrderDate'].dt.month
matplotlibdf['weekday'] = matplotlibdf['OrderDate'].dt.weekday


sns.factorplot(data = matplotlibdf, x = 'month', y = "Sales", 
               col = 'Segment', # per store type in cols
               palette = 'plasma',
               hue = 'Segment',
               row = 'is_discount', # per promo in the store in rows
               color = c,
               estimator=sum) 
Out[36]:
<seaborn.axisgrid.FacetGrid at 0x11a3166a0>
In [37]:
def f(row):
    if float(row['Discount']) > 0:
        return 1
    else:
        return 0

matplotlibdf = sales[['OrderDate', 'Sub-Category' , 'Sales', 'Discount', 'Customer ID']].copy()
matplotlibdf['OrderDate'] = pd.to_datetime(matplotlibdf['OrderDate'], format='%m/%d/%y')
matplotlibdf['is_discount'] = matplotlibdf.apply(f, axis=1)
matplotlibdf['month'] = matplotlibdf['OrderDate'].dt.month
matplotlibdf['weekday'] = matplotlibdf['OrderDate'].dt.weekday


sns.factorplot(data = matplotlibdf, x = 'month', y = "Sales", 
               col = 'Sub-Category', # per store type in cols
               palette = 'plasma',
               hue = 'Sub-Category',
               row = 'is_discount', # per promo in the store in rows
               color = c,
               estimator=sum) 
Out[37]:
<seaborn.axisgrid.FacetGrid at 0x11b278eb8>

Let's see if we can find any correlation between the columns

In [38]:
def f(row):
    if float(row['Discount']) > 0:
        return 1
    else:
        return 0

matplotlibdf = sales.copy()
matplotlibdf['OrderDate'] = pd.to_datetime(matplotlibdf['OrderDate'], format='%m/%d/%y')
matplotlibdf['is_discount'] = matplotlibdf.apply(f, axis=1)
matplotlibdf['month'] = matplotlibdf['OrderDate'].dt.month
matplotlibdf['weekday'] = matplotlibdf['OrderDate'].dt.weekday
corr_all = matplotlibdf.corr()

# Generate a mask for the upper triangle
mask = np.zeros_like(corr_all, dtype = np.bool)
mask[np.triu_indices_from(mask)] = True

# Set up the matplotlib figure
f, ax = plt.subplots(figsize = (11, 9))

# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr_all, mask = mask,
            square = True, linewidths = .5, ax = ax, cmap = "BuPu")      
#plt.show()
Out[38]:
<matplotlib.axes._subplots.AxesSubplot at 0x1210b5128>

TakeAways

  • Discounts make difference in West Region. In other regions, discount aren't useful
  • Giving discounts to customers in Home Office Segment is very helpful.
  • People buy chairs mostly in discount
  • Chairs and Tables are sold around year end
  • Discount doesn't matter in envelopes, fastners, labels and Art.
  • Cities like Burlington, Lafayette, Minneapolis, Providence have lot of customers and good AvgSalesPerCustomer. To benefit from this fact, Store can consider proposing bigger variety of its products.

Sales prediction on Rosmann Data

In [39]:
# store data
store = pd.read_csv("store.csv", 
                    low_memory = False)
  • Id - an Id that represents a (Store, Date) duple within the test set
  • Store - a unique Id for each store
  • Sales - the turnover for any given day (this is what you are predicting)
  • Customers - the number of customers on a given day
  • Open - an indicator for whether the store was open: 0 = closed, 1 = open
  • StateHoliday - indicates a state holiday. Normally all stores, with few exceptions, are closed on state holidays. Note that all schools are closed on public holidays and weekends. a = public holiday, b = Easter holiday, c = Christmas, 0 = None
  • SchoolHoliday - indicates if the (Store, Date) was affected by the closure of public schools
  • StoreType - differentiates between 4 different store models: a, b, c, d
  • Assortment - describes an assortment level: a = basic, b = extra, c = extended
  • CompetitionDistance - distance in meters to the nearest competitor store
  • CompetitionOpenSince[Month/Year] - gives the approximate year and month of the time the nearest competitor was opened
  • Promo - indicates whether a store is running a promo on that day
  • Promo2 - Promo2 is a continuing and consecutive promotion for some stores: 0 = store is not participating, 1 = store is participating
  • Promo2Since[Year/Week] - describes the year and calendar week when the store started participating in Promo2
  • PromoInterval - describes the consecutive intervals Promo2 is started, naming the months the promotion is started - anew. E.g. "Feb,May,Aug,Nov" means each round starts in February, May, August, November of any given year for that store
In [40]:
store.isnull().sum()
Out[40]:
Store                          0
StoreType                      0
Assortment                     0
CompetitionDistance            3
CompetitionOpenSinceMonth    354
CompetitionOpenSinceYear     354
Promo2                         0
Promo2SinceWeek              544
Promo2SinceYear              544
PromoInterval                544
dtype: int64
In [41]:
store['CompetitionDistance'].fillna(store['CompetitionDistance'].median(), inplace = True)
store.fillna(0, inplace = True)
In [42]:
# importing data
df = pd.read_csv("train.csv",low_memory = False)

df.info()

# remove closed stores and those with no sales
df = df[(df["Open"] != 0) & (df['Sales'] != 0)]

# sales for the store number 1 (StoreType C)
sales = df[df.Store == 1].loc[:, ['Date', 'Sales']]

# reverse to the order: from 2013 to 2015
sales = sales.sort_index(ascending = False)

# to datetime64
sales['Date'] = pd.DatetimeIndex(sales['Date'])
sales.dtypes
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1017209 entries, 0 to 1017208
Data columns (total 9 columns):
Store            1017209 non-null int64
DayOfWeek        1017209 non-null int64
Date             1017209 non-null object
Sales            1017209 non-null int64
Customers        1017209 non-null int64
Open             1017209 non-null int64
Promo            1017209 non-null int64
StateHoliday     1017209 non-null object
SchoolHoliday    1017209 non-null int64
dtypes: int64(7), object(2)
memory usage: 69.8+ MB
Out[42]:
Date     datetime64[ns]
Sales             int64
dtype: object
In [43]:
# from the prophet documentation every variables should have specific names
sales = sales.rename(columns = {'Date': 'ds',
                                'Sales': 'y'})
sales.head()
Out[43]:
ds y
1014980 2013-01-02 5530
1013865 2013-01-03 4327
1012750 2013-01-04 4486
1011635 2013-01-05 4997
1009405 2013-01-07 7176
In [44]:
# plot daily sales
ax = sales.set_index('ds').plot(figsize = (12, 4), color = c)
ax.set_ylabel('Daily Number of Sales')
ax.set_xlabel('Date')
plt.show()
In [45]:
# create holidays dataframe
state_dates = df[(df.StateHoliday == 'a') | (df.StateHoliday == 'b') & (df.StateHoliday == 'c')].loc[:, 'Date'].values
school_dates = df[df.SchoolHoliday == 1].loc[:, 'Date'].values

state = pd.DataFrame({'holiday': 'state-holiday',
                      'ds': pd.to_datetime(state_dates)})
school = pd.DataFrame({'holiday': 'school-holiday',
                      'ds': pd.to_datetime(school_dates)})

holidays = pd.concat((state, school))      
holidays
Out[45]:
ds holiday
0 2015-06-04 state-holiday
1 2015-06-04 state-holiday
2 2015-06-04 state-holiday
3 2015-06-04 state-holiday
4 2015-06-04 state-holiday
5 2015-06-04 state-holiday
6 2015-06-04 state-holiday
7 2015-06-04 state-holiday
8 2015-06-04 state-holiday
9 2015-06-04 state-holiday
10 2015-06-04 state-holiday
11 2015-06-04 state-holiday
12 2015-06-04 state-holiday
13 2015-06-04 state-holiday
14 2015-06-04 state-holiday
15 2015-06-04 state-holiday
16 2015-06-04 state-holiday
17 2015-06-04 state-holiday
18 2015-06-04 state-holiday
19 2015-06-04 state-holiday
20 2015-06-04 state-holiday
21 2015-06-04 state-holiday
22 2015-06-04 state-holiday
23 2015-06-04 state-holiday
24 2015-06-04 state-holiday
25 2015-06-04 state-holiday
26 2015-06-04 state-holiday
27 2015-06-04 state-holiday
28 2015-06-04 state-holiday
29 2015-06-04 state-holiday
... ... ...
163415 2013-01-02 school-holiday
163416 2013-01-02 school-holiday
163417 2013-01-02 school-holiday
163418 2013-01-02 school-holiday
163419 2013-01-02 school-holiday
163420 2013-01-02 school-holiday
163421 2013-01-02 school-holiday
163422 2013-01-02 school-holiday
163423 2013-01-02 school-holiday
163424 2013-01-02 school-holiday
163425 2013-01-02 school-holiday
163426 2013-01-02 school-holiday
163427 2013-01-02 school-holiday
163428 2013-01-01 school-holiday
163429 2013-01-01 school-holiday
163430 2013-01-01 school-holiday
163431 2013-01-01 school-holiday
163432 2013-01-01 school-holiday
163433 2013-01-01 school-holiday
163434 2013-01-01 school-holiday
163435 2013-01-01 school-holiday
163436 2013-01-01 school-holiday
163437 2013-01-01 school-holiday
163438 2013-01-01 school-holiday
163439 2013-01-01 school-holiday
163440 2013-01-01 school-holiday
163441 2013-01-01 school-holiday
163442 2013-01-01 school-holiday
163443 2013-01-01 school-holiday
163444 2013-01-01 school-holiday

164139 rows × 2 columns

In [46]:
model = Prophet(interval_width = 0.95, holidays = holidays)
model.fit(sales)

# dataframe that extends into future 6 weeks 
future_dates = model.make_future_dataframe(periods = 6*7)

print("First week to forecast.")
future_dates.tail(7)
INFO:fbprophet.forecaster:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
First week to forecast.
/Users/vijaynandwani/anaconda3/envs/pallavi/lib/python3.6/site-packages/pystan/misc.py:399: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.
  elif np.issubdtype(np.asarray(v).dtype, float):
Out[46]:
ds
816 2015-09-05
817 2015-09-06
818 2015-09-07
819 2015-09-08
820 2015-09-09
821 2015-09-10
822 2015-09-11
In [47]:
# predictions
forecast = model.predict(future_dates)

# preditions for last week
# forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']].tail(7)
model.plot(forecast);
In [48]:
model.plot_components(forecast);
In [49]:
all_metrics(model)
Out[49]:
{'ME': -0.18364937928120184,
 'MSE': 693623.275330607,
 'RMSE': 832.840486126009,
 'MAE': 674.4633013631615,
 'MPE': 0.030687559859972784,
 'MAPE': 0.1475343174923204}

The first plot shows that the monthly sales of store number 1 has been linearly decreasing over time and the second shows the holiays gaps included in the model. The third plot highlights the fact that the weekly volume of last week sales peaks towards the Monday of the next week, while the forth plot shows that the most buzy season occurs during the Christmas holidays.

Prophet only requires ds and y to predict future values. Let's now perform multiple regression

In [121]:
df.head()
Out[121]:
Store DayOfWeek Date Sales Customers Open Promo StateHoliday SchoolHoliday
0 1 5 2015-07-31 5263 555 1 1 0 1
1 2 5 2015-07-31 6064 625 1 1 0 1
2 3 5 2015-07-31 8314 821 1 1 0 1
3 4 5 2015-07-31 13995 1498 1 1 0 1
4 5 5 2015-07-31 4822 559 1 1 0 1
In [122]:
train = pd.read_csv("train.csv",parse_dates = True, index_col = 'Date')
train['Year'] = train.index.year
train['Month'] = train.index.month
train['Day'] = train.index.day
train['WeekOfYear'] = train.index.weekofyear
train = train.reset_index()
df_new = pd.merge(train, store, how='left', on='Store')
df_new.columns.values
/Users/vijaynandwani/anaconda3/envs/pallavi/lib/python3.6/site-packages/IPython/core/interactiveshell.py:2785: DtypeWarning: Columns (7) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)
Out[122]:
array(['Date', 'Store', 'DayOfWeek', 'Sales', 'Customers', 'Open',
       'Promo', 'StateHoliday', 'SchoolHoliday', 'Year', 'Month', 'Day',
       'WeekOfYear', 'StoreType', 'Assortment', 'CompetitionDistance',
       'CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear', 'Promo2',
       'Promo2SinceWeek', 'Promo2SinceYear', 'PromoInterval'],
      dtype=object)
In [123]:
df_new = df_new.drop(['CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear','Promo2SinceWeek',
                     'Promo2SinceYear', 'PromoInterval'], axis=1)
In [124]:
df_new.head()
Out[124]:
Date Store DayOfWeek Sales Customers Open Promo StateHoliday SchoolHoliday Year Month Day WeekOfYear StoreType Assortment CompetitionDistance Promo2
0 2015-07-31 1 5 5263 555 1 1 0 1 2015 7 31 31 c a 1270.0 0
1 2015-07-31 2 5 6064 625 1 1 0 1 2015 7 31 31 a a 570.0 1
2 2015-07-31 3 5 8314 821 1 1 0 1 2015 7 31 31 a a 14130.0 1
3 2015-07-31 4 5 13995 1498 1 1 0 1 2015 7 31 31 c c 620.0 0
4 2015-07-31 5 5 4822 559 1 1 0 1 2015 7 31 31 a a 29910.0 0
In [125]:
df_new = df_new[df_new.Open != 0]
df_new = df_new.drop('Open', axis=1) #it's useless now
df_new = df_new[df_new.Sales != 0]
In [126]:
# Converting state holiday to string as other values are in string (a,b,c)
df_new.loc[df_new.StateHoliday == 0,'StateHoliday'] = df_new.loc[df_new.StateHoliday == 0,'StateHoliday'].astype(str)
In [127]:
# calculate weekly average sales
sales = df_new[['Year','Month','Store','Sales']].groupby(['Year','Month','Store']).mean()
sales = sales.rename(columns={'Sales':'AvgSales'})
sales = sales.reset_index()
sales.head()
Out[127]:
Year Month Store AvgSales
0 2013 1 1 4939.653846
1 2013 1 2 4429.653846
2 2013 1 3 6371.269231
3 2013 1 4 9027.423077
4 2013 1 5 4209.307692
In [128]:
# Time to merge them!

df_new['sales_key']=df_new['Year'].map(str) + df_new['Month'].map(str) + df_new['Store'].map(str)
sales['sales_key']=sales['Year'].map(str) + sales['Month'].map(str) + sales['Store'].map(str)
In [129]:
# drop extra columns
sales = sales.drop(['Year','Month','Store'], axis=1)
# merge
df_new = pd.merge(df_new, sales, how='left', on=('sales_key'))
In [130]:
# Avg number of customers

cust = df_new[['Year','Month','Store','Customers']].groupby(['Year','Month', 'Store']).mean()
cust = cust.rename(columns={'Customers':'AvgCustomer'})
cust = cust.reset_index()
df_new['cust_key']=df_new['Year'].map(str) + df_new['Month'].map(str) + df_new['Store'].map(str)
cust['cust_key']=cust['Year'].map(str) + cust['Month'].map(str) + cust['Store'].map(str)
In [131]:
df_new = df_new.drop('Customers', axis=1)# drop extra columns
cust = cust.drop(['Year', 'Month', 'Store'], axis=1)
In [132]:
df_new = pd.merge(df_new, cust, how="left", on=('cust_key'))
In [133]:
df_new['StateHoliday'] = df_new.StateHoliday.map({'0':0, 'a':1 ,'b' : 1,'c': 1})
In [134]:
df_new = df_new.drop(['cust_key','sales_key','Store', 'Date'], axis=1)
In [135]:
X = df_new.drop('Sales', axis=1)
y = df_new.Sales
In [136]:
xd = X.copy()
xd = pd.get_dummies(xd)
xd.head()
Out[136]:
DayOfWeek Promo StateHoliday SchoolHoliday Year Month Day WeekOfYear CompetitionDistance Promo2 AvgSales AvgCustomer StoreType_a StoreType_b StoreType_c StoreType_d Assortment_a Assortment_b Assortment_c
0 5 1 0 1 2015 7 31 31 1270.0 0 4491.333333 519.407407 0 0 1 0 1 0 0
1 5 1 0 1 2015 7 31 31 570.0 1 4954.259259 621.222222 1 0 0 0 1 0 0
2 5 1 0 1 2015 7 31 31 14130.0 1 6797.592593 682.888889 1 0 0 0 1 0 0
3 5 1 0 1 2015 7 31 31 620.0 0 10256.851852 1294.259259 0 0 1 0 0 0 1
4 5 1 0 1 2015 7 31 31 29910.0 0 4599.629630 521.703704 1 0 0 0 1 0 0
In [137]:
# label nominal variables for tree based regression
xl = X.copy()

from sklearn.preprocessing import LabelEncoder
label = LabelEncoder()
xl.StateHoliday = label.fit_transform(xl.StateHoliday)
xl.Assortment = label.fit_transform(xl.Assortment)
xl.StoreType = label.fit_transform(xl.StoreType)

# train test split

# split training and test datasets
from sklearn.cross_validation import train_test_split
xd_train,xd_test,yd_train,yd_test = train_test_split(xd,y,test_size=0.3, random_state=1)
xl_train,xl_test,yl_train,yl_test = train_test_split(xl,y,test_size=0.3, random_state=1)

xd_train.head()
Out[137]:
DayOfWeek Promo StateHoliday SchoolHoliday Year Month Day WeekOfYear CompetitionDistance Promo2 AvgSales AvgCustomer StoreType_a StoreType_b StoreType_c StoreType_d Assortment_a Assortment_b Assortment_c
214809 4 0 0 0 2014 12 11 50 35280.0 0 5355.555556 551.296296 1 0 0 0 0 0 1
598561 3 1 0 0 2013 11 6 45 30360.0 0 6658.961538 566.769231 1 0 0 0 0 0 1
736221 6 0 0 0 2013 6 15 24 220.0 1 11358.960000 1388.320000 1 0 0 0 0 0 1
542693 6 0 0 0 2013 12 21 51 2930.0 0 7701.000000 678.458333 0 0 0 1 1 0 0
332170 1 1 0 1 2014 7 28 31 50.0 0 6294.370370 513.888889 0 0 0 1 0 0 1

Linear Regression

In [220]:
from sklearn.linear_model import LinearRegression
lin= LinearRegression()
linreg = lin.fit(xd_train, yd_train)
In [221]:
# definte RMSE function
from sklearn.metrics import mean_squared_error
from math import sqrt

def rmse(x, y):
    return sqrt(mean_squared_error(x, y))

# definte MAPE function
def mape(x, y): 
    return np.mean(np.abs((x - y) / x)) * 100  
  
# get cross validation scores 
yd_predicted = linreg.predict(xd_train)
yd_test_predicted = linreg.predict(xd_test)

linear_train_score = linreg.score(xd_train, yd_train)
linear_test_score = linreg.score(xd_test, yd_test)
linear_train_mape = mape(yd_train, yd_predicted)
linear_test_mape = mape(yd_test, yd_test_predicted)
print("Regresion Model Score" , ":" , linear_train_score , "," , 
      "Out of Sample Test Score" ,":" , linear_test_score)
print("Training RMSE", ":", rmse(yd_train, yd_predicted),
      "Testing RMSE", ":", rmse(yd_test, yd_test_predicted))
print("Training MAPE", ":", linear_train_mape,
      "Testing MAPE", ":", linear_test_mape)
Regresion Model Score : 0.7486872962950807 , Out of Sample Test Score : 0.7482884273948067
Training RMSE : 1552.892778436124 Testing RMSE : 1556.0327257032734
Training MAPE : 16.99620812052566 Testing MAPE : 17.053657649224977
In [222]:
import matplotlib

matplotlib.rcParams['figure.figsize'] = (6.0, 6.0)
preds = pd.DataFrame({"preds":yd_test_predicted, "true":yd_test})
preds["residuals"] = preds["true"] - preds["preds"]
preds = preds.sample(n=200)
preds.plot(x = "preds", y = "residuals",kind = "scatter")
plt.axis([0, 25000, -10000, 10000])
plt.title("Residual plot in Linear")
Out[222]:
Text(0.5,1,'Residual plot in Linear')

Bayesian Ridge Regression

In [223]:
from sklearn.linear_model import BayesianRidge
rdg = BayesianRidge()
rdgreg = rdg.fit(xd_train, yd_train)
In [224]:
yd_predicted = rdgreg.predict(xd_train)
yd_test_predicted = rdgreg.predict(xd_test)

ridge_train_score = rdgreg.score(xd_train, yd_train)
ridge_test_score = rdgreg.score(xd_test, yd_test)
ridge_train_mape = mape(yd_train, yd_predicted)
ridge_test_mape = mape(yd_test, yd_test_predicted)

# validation
print("Regresion Model Score" , ":" , ridge_train_score , "," ,
      "Out of Sample Test Score" ,":" , ridge_test_score)
print("Training RMSE", ":", rmse(yd_train, yd_predicted),
      "Testing RMSE", ":", rmse(yd_test, yd_test_predicted))
print("Training MAPE", ":", ridge_train_mape,
      "Testing MAPE", ":", ridge_test_mape)
Regresion Model Score : 0.7486872941067197 , Out of Sample Test Score : 0.7482882999857583
Training RMSE : 1552.892785197202 Testing RMSE : 1556.0331195123892
Training MAPE : 16.996135206429493 Testing MAPE : 17.053588022278625
In [225]:
import matplotlib

matplotlib.rcParams['figure.figsize'] = (6.0, 6.0)
preds = pd.DataFrame({"preds":yd_test_predicted, "true":yd_test})
preds["residuals"] = preds["true"] - preds["preds"]
preds = preds.sample(n=200)
preds.plot(x = "preds", y = "residuals",kind = "scatter")
plt.axis([0, 25000, -10000, 10000])
plt.title("Residual plot in Bayesian Ridge")
Out[225]:
Text(0.5,1,'Residual plot in Bayesian Ridge')

LARS Lasso Regression

In [226]:
from sklearn.linear_model import LassoLars
las = LassoLars(alpha=0.3, fit_intercept=False, normalize=True)
lasreg = las.fit(xd_train, yd_train)
In [227]:
yd_predicted = lasreg.predict(xd_train)
yd_test_predicted = lasreg.predict(xd_test)


lasso_train_score = lasreg.score(xd_train, yd_train)
lasso_test_score = lasreg.score(xd_test, yd_test)
lasso_train_mape = mape(yd_train, yd_predicted)
lasso_test_mape = mape(yd_test, yd_test_predicted)


print("Regresion Model Score" , ":" , lasso_train_score , "," ,
      "Out of Sample Test Score" ,":" , lasso_test_score)
print("Training RMSE", ":", rmse(yd_train, yd_predicted),
      "Testing RMSE", ":", rmse(yd_test, yd_test_predicted))
print("Training MAPE", ":", lasso_train_mape,
      "Testing MAPE", ":", lasso_test_mape)
Regresion Model Score : 0.7486548608147864 , Out of Sample Test Score : 0.7482757514380954
Training RMSE : 1552.9929866573425 Testing RMSE : 1556.0719053781515
Training MAPE : 16.99090592730715 Testing MAPE : 17.04779653117055
In [228]:
import matplotlib

matplotlib.rcParams['figure.figsize'] = (6.0, 6.0)
preds = pd.DataFrame({"preds":yd_test_predicted, "true":yd_test})
preds["residuals"] = preds["true"] - preds["preds"]
preds = preds.sample(n=200)
preds.plot(x = "preds", y = "residuals",kind = "scatter")
plt.axis([0, 25000, -10000, 10000])
plt.title("Residual plot in LARS Lasso")
Out[228]:
Text(0.5,1,'Residual plot in LARS Lasso')

Decision Tree Regression

In [229]:
from sklearn.tree import DecisionTreeRegressor
tree = DecisionTreeRegressor(min_samples_leaf=20)
treereg = tree.fit(xl_train, yl_train)
In [230]:
yl_predicted = treereg.predict(xl_train)
yl_test_predicted = treereg.predict(xl_test)

decision_train_score = treereg.score(xl_train, yl_train)
decision_test_score = treereg.score(xl_test, yl_test)
decision_train_mape = mape(yl_train, yl_predicted)
decision_test_mape = mape(yl_test, yl_test_predicted)

print("Regresion Model Score" , ":" , decision_train_score , "," ,
      "Out of Sample Test Score" ,":" , decision_test_score)
print("Training RMSE", ":", rmse(yl_train, yl_predicted),
      "Testing RMSE", ":", rmse(yl_test, yl_test_predicted))
print("Training MAPE", ":", decision_train_mape,
      "Testing MAPE", ":", decision_test_mape)
Regresion Model Score : 0.9027583009397228 , Out of Sample Test Score : 0.8746215340532151
Training RMSE : 965.9630593620384 Testing RMSE : 1098.1928479490336
Training MAPE : 9.775973915806745 Testing MAPE : 11.225769676560038
In [231]:
import matplotlib

matplotlib.rcParams['figure.figsize'] = (6.0, 6.0)
preds = pd.DataFrame({"preds":yl_test_predicted, "true":yl_test})
preds["residuals"] = preds["true"] - preds["preds"]
preds = preds.sample(n=200)
preds.plot(x = "preds", y = "residuals",kind = "scatter")
plt.axis([0, 25000, -10000, 10000])
plt.title("Residual plot in Decision Tree")
Out[231]:
Text(0.5,1,'Residual plot in Decision Tree')

Random Forest Regression

In [233]:
from sklearn.ensemble import RandomForestRegressor
rdf = RandomForestRegressor(n_estimators=30)
rdfreg = rdf.fit(xl_train, yl_train)
In [234]:
from sklearn.ensemble import RandomForestRegressor
rdf = RandomForestRegressor(n_estimators=30)
rdfreg = rdf.fit(xl_train, yl_train)
yl_predicted = rdfreg.predict(xl_train)
yl_test_predicted = rdfreg.predict(xl_test)

randomforest_train_score = rdfreg.score(xl_train, yl_train)
randomforest_test_score = rdfreg.score(xl_test, yl_test)
randomforest_train_mape = mape(yl_train, yl_predicted)
randomforest_test_mape = mape(yl_test, yl_test_predicted)

print("Regresion Model Score" , ":" , randomforest_train_score , "," ,
      "Out of Sample Test Score" ,":" , randomforest_test_score)   
print("Training RMSE", ":", rmse(yl_train, yl_predicted),
      "Testing RMSE", ":", rmse(yl_test, yl_test_predicted))
print("Training MAPE", ":", randomforest_train_mape,
      "Testing MAPE", ":", randomforest_test_mape)
Regresion Model Score : 0.9877708225256921 , Out of Sample Test Score : 0.9193156476149353
Training RMSE : 342.5570207957362 Testing RMSE : 880.9713680060886
Training MAPE : 3.420060947848951 Testing MAPE : 9.03730573775501
In [235]:
import matplotlib

matplotlib.rcParams['figure.figsize'] = (6.0, 6.0)
preds = pd.DataFrame({"preds":yl_test_predicted, "true":yl_test})
preds["residuals"] = preds["true"] - preds["preds"]
preds = preds.sample(n=200)
preds.plot(x = "preds", y = "residuals",kind = "scatter")
plt.axis([0, 25000, -10000, 10000])
plt.title("Residual plot in Random Forest Tree")
Out[235]:
Text(0.5,1,'Residual plot in Random Forest Tree')

K-Nearest Neighbors Regression

In [236]:
from sklearn.neighbors import KNeighborsRegressor
knn = KNeighborsRegressor(n_neighbors = 30)
knnreg = knn.fit(xd_train, yd_train)
In [237]:
yd_predicted = knnreg.predict(xd_train)
yd_test_predicted = knnreg.predict(xd_test)

knn_train_score = knnreg.score(xd_train, yd_train)
knn_test_score = knnreg.score(xd_test, yd_test)
knn_train_mape = mape(yd_train, yd_predicted)
knn_test_mape = mape(yd_test, yd_test_predicted)

print("Regresion Model Score" , ":" , knn_train_score , "," ,
      "Out of Sample Test Score" ,":" , knn_test_score)
print("Training RMSE", ":", rmse(yd_train, yd_predicted),
      "Testing RMSE", ":", rmse(yd_test, yd_test_predicted))
print("Training MAPE", ":", knn_train_mape,
      "Testing MAPE", ":", knn_test_mape)
Regresion Model Score : 0.6514919969772337 , Out of Sample Test Score : 0.6277013414400041
Training RMSE : 1828.6922605326138 Testing RMSE : 1892.399701290796
Training MAPE : 21.983167017591406 Testing MAPE : 22.787960576353292
In [238]:
import matplotlib

matplotlib.rcParams['figure.figsize'] = (6.0, 6.0)
preds = pd.DataFrame({"preds":yd_test_predicted, "true":yd_test})
preds["residuals"] = preds["true"] - preds["preds"]
preds = preds.sample(n=200)
preds.plot(x = "preds", y = "residuals",kind = "scatter")
plt.axis([0, 25000, -10000, 10000])
plt.title("Residual plot in Knn")
Out[238]:
Text(0.5,1,'Residual plot in Knn')

Plotting comparison between different models

In [240]:
train_error=[linear_train_mape,ridge_train_mape,lasso_train_mape,decision_train_mape,randomforest_train_mape, knn_train_mape]
test_error=[linear_test_mape,ridge_test_mape,lasso_test_mape,decision_test_mape,randomforest_test_mape, knn_test_mape]

col={'Train Error':train_error,'Test Error':test_error}
models=['Linear Regression','Bayesian Ridge Regression','LARS Lasso Regression','Decision Tree','Random Forest', 'K-Nearest Neighbours']
df=pd.DataFrame(data=col,index=models)
df.plot(kind='bar')
Out[240]:
<matplotlib.axes._subplots.AxesSubplot at 0x145ab09e8>
In [242]:
train_score=[linear_train_score,ridge_train_score,lasso_train_score,decision_train_score,randomforest_train_score, knn_train_score]
test_score=[linear_test_score,ridge_test_score,lasso_test_score,decision_test_score,randomforest_test_score, knn_test_score]

col={'Train Score':train_score,'Test Score':test_score}
models=['Linear Regression','Bayesian Ridge Regression','LARS Lasso Regression','Decision Tree','Random Forest', 'K-Nearest Neighbours']
df=pd.DataFrame(data=col,index=models)
df.plot(kind='bar')
Out[242]:
<matplotlib.axes._subplots.AxesSubplot at 0x146434748>

Feature importance

In [244]:
features = xl_train.columns
importances = rdfreg.feature_importances_
indices = np.argsort(importances)
plt.figure(figsize=(8,10))
plt.title('Feature Importances', fontsize=20)
plt.barh(range(len(indices)), importances[indices], color='blue', align='center')
plt.yticks(range(len(indices)), features[indices])
plt.xlabel('Relative Importance')
Out[244]:
Text(0.5,0,'Relative Importance')

Trying out the XGBOOST Approach

In [245]:
def rmspe(y, yhat):
    return np.sqrt(np.mean((yhat / y-1) ** 2))

def rmspe_xg(yhat, y):
    y = np.expm1(y.get_label())
    yhat = np.expm1(yhat)
    return "rmspe", rmspe(y, yhat)

Tuning Parameters

  • eta: Step size used in updating weights. Lower value means slower training but better convergence.
  • num_round: Total number of iterations.
  • subsample: The ratio of training data used in each iteration; combat overfitting. Should be configured in the range of 30% to 80% of the training dataset, and compared to a value of 100% for no sampling.
  • colsample_bytree: The ratio of features used in each iteration, default 1.
  • max_depth: The maximum depth of each tree. If we do not limit max depth, gradient boosting would eventually overfit.
  • early_stopping_rounds: If there's no increase in validation score for a given number of iterations, the algorithm will stop early, also combats overfitting.
In [246]:
# base parameters
params = {
    'booster': 'gbtree', 
    'objective': 'reg:linear', # regression task
    'subsample': 0.8, # 80% of data to grow trees and prevent overfitting
    'colsample_bytree': 0.85, # 85% of features used
    'eta': 0.1, 
    'max_depth': 10, 
    'seed': 42} # for reproducible results
In [247]:
import xgboost as xgb
from xgboost.sklearn import XGBRegressor # wrapper
In [249]:
# XGB with xgboost library
yl_train = np.log(yl_train)
yl_test = np.log(yl_test)
dtrain = xgb.DMatrix(xl_train, yl_train)
dtest = xgb.DMatrix(xl_test, yl_test)

watchlist = [(dtrain, 'train'), (dtest, 'test')]

xgb_model = xgb.train(params, dtrain, 300, evals = watchlist,
                      early_stopping_rounds = 50, feval = rmspe_xg, verbose_eval = True)
[0]	train-rmse:7.44245	test-rmse:7.44272	train-rmspe:0.99953	test-rmspe:0.99953
Multiple eval metrics have been passed: 'test-rmspe' will be used for early stopping.

Will train until test-rmspe hasn't improved in 50 rounds.
[1]	train-rmse:6.69918	test-rmse:6.69954	train-rmspe:0.998834	test-rmspe:0.998834
[2]	train-rmse:6.03026	test-rmse:6.03064	train-rmspe:0.997581	test-rmspe:0.997581
[3]	train-rmse:5.42801	test-rmse:5.4284	train-rmspe:0.99548	test-rmspe:0.99548
[4]	train-rmse:4.8865	test-rmse:4.88687	train-rmspe:0.992146	test-rmspe:0.992147
[5]	train-rmse:4.39873	test-rmse:4.39912	train-rmspe:0.987184	test-rmspe:0.987186
[6]	train-rmse:3.95983	test-rmse:3.96023	train-rmspe:0.980133	test-rmspe:0.980141
[7]	train-rmse:3.56503	test-rmse:3.56544	train-rmspe:0.970545	test-rmspe:0.970564
[8]	train-rmse:3.20978	test-rmse:3.21018	train-rmspe:0.958056	test-rmspe:0.958096
[9]	train-rmse:2.89001	test-rmse:2.89041	train-rmspe:0.942381	test-rmspe:0.942461
[10]	train-rmse:2.6024	test-rmse:2.60282	train-rmspe:0.923332	test-rmspe:0.923486
[11]	train-rmse:2.34365	test-rmse:2.34409	train-rmspe:0.900868	test-rmspe:0.901146
[12]	train-rmse:2.11078	test-rmse:2.11124	train-rmspe:0.875114	test-rmspe:0.875579
[13]	train-rmse:1.90158	test-rmse:1.90205	train-rmspe:0.846225	test-rmspe:0.846959
[14]	train-rmse:1.71314	test-rmse:1.71365	train-rmspe:0.814679	test-rmspe:0.815798
[15]	train-rmse:1.54379	test-rmse:1.54433	train-rmspe:0.780794	test-rmspe:0.782484
[16]	train-rmse:1.39144	test-rmse:1.39203	train-rmspe:0.745107	test-rmspe:0.747543
[17]	train-rmse:1.25447	test-rmse:1.25507	train-rmspe:0.708123	test-rmspe:0.711507
[18]	train-rmse:1.13143	test-rmse:1.13206	train-rmspe:0.670315	test-rmspe:0.674782
[19]	train-rmse:1.02084	test-rmse:1.02151	train-rmspe:0.632251	test-rmspe:0.63821
[20]	train-rmse:0.921519	test-rmse:0.922227	train-rmspe:0.594381	test-rmspe:0.602047
[21]	train-rmse:0.832404	test-rmse:0.833164	train-rmspe:0.557068	test-rmspe:0.566969
[22]	train-rmse:0.75244	test-rmse:0.753264	train-rmspe:0.520668	test-rmspe:0.533181
[23]	train-rmse:0.680713	test-rmse:0.681587	train-rmspe:0.485651	test-rmspe:0.501056
[24]	train-rmse:0.616447	test-rmse:0.617397	train-rmspe:0.452159	test-rmspe:0.470682
[25]	train-rmse:0.559011	test-rmse:0.560027	train-rmspe:0.420459	test-rmspe:0.442653
[26]	train-rmse:0.507665	test-rmse:0.508757	train-rmspe:0.390709	test-rmspe:0.417103
[27]	train-rmse:0.461724	test-rmse:0.462906	train-rmspe:0.362906	test-rmspe:0.393951
[28]	train-rmse:0.420973	test-rmse:0.42224	train-rmspe:0.337138	test-rmspe:0.373393
[29]	train-rmse:0.384603	test-rmse:0.385981	train-rmspe:0.313586	test-rmspe:0.355096
[30]	train-rmse:0.352364	test-rmse:0.353826	train-rmspe:0.292175	test-rmspe:0.339263
[31]	train-rmse:0.323794	test-rmse:0.325376	train-rmspe:0.272825	test-rmspe:0.325734
[32]	train-rmse:0.29859	test-rmse:0.300317	train-rmspe:0.255533	test-rmspe:0.3144
[33]	train-rmse:0.276473	test-rmse:0.278341	train-rmspe:0.240232	test-rmspe:0.30504
[34]	train-rmse:0.256911	test-rmse:0.258934	train-rmspe:0.226573	test-rmspe:0.297321
[35]	train-rmse:0.239738	test-rmse:0.241915	train-rmspe:0.214566	test-rmspe:0.290745
[36]	train-rmse:0.224924	test-rmse:0.227254	train-rmspe:0.204385	test-rmspe:0.286135
[37]	train-rmse:0.212052	test-rmse:0.214536	train-rmspe:0.195586	test-rmspe:0.282561
[38]	train-rmse:0.200834	test-rmse:0.203477	train-rmspe:0.188075	test-rmspe:0.279602
[39]	train-rmse:0.191202	test-rmse:0.193999	train-rmspe:0.181663	test-rmspe:0.27671
[40]	train-rmse:0.182824	test-rmse:0.185756	train-rmspe:0.176134	test-rmspe:0.274865
[41]	train-rmse:0.175796	test-rmse:0.178855	train-rmspe:0.17179	test-rmspe:0.273762
[42]	train-rmse:0.169565	test-rmse:0.172734	train-rmspe:0.167891	test-rmspe:0.271688
[43]	train-rmse:0.164291	test-rmse:0.167571	train-rmspe:0.164707	test-rmspe:0.272989
[44]	train-rmse:0.159939	test-rmse:0.163348	train-rmspe:0.162271	test-rmspe:0.272732
[45]	train-rmse:0.156268	test-rmse:0.159773	train-rmspe:0.160396	test-rmspe:0.272598
[46]	train-rmse:0.153122	test-rmse:0.156712	train-rmspe:0.158854	test-rmspe:0.272168
[47]	train-rmse:0.150475	test-rmse:0.154163	train-rmspe:0.157311	test-rmspe:0.27243
[48]	train-rmse:0.148232	test-rmse:0.152015	train-rmspe:0.156285	test-rmspe:0.272491
[49]	train-rmse:0.146324	test-rmse:0.150185	train-rmspe:0.155405	test-rmspe:0.273044
[50]	train-rmse:0.144321	test-rmse:0.148258	train-rmspe:0.154135	test-rmspe:0.271865
[51]	train-rmse:0.142843	test-rmse:0.146866	train-rmspe:0.153475	test-rmspe:0.27204
[52]	train-rmse:0.140994	test-rmse:0.145116	train-rmspe:0.152332	test-rmspe:0.271763
[53]	train-rmse:0.140069	test-rmse:0.144241	train-rmspe:0.151998	test-rmspe:0.272038
[54]	train-rmse:0.138917	test-rmse:0.143168	train-rmspe:0.151387	test-rmspe:0.272138
[55]	train-rmse:0.138121	test-rmse:0.142419	train-rmspe:0.151119	test-rmspe:0.272361
[56]	train-rmse:0.137221	test-rmse:0.141565	train-rmspe:0.150501	test-rmspe:0.272382
[57]	train-rmse:0.13638	test-rmse:0.140793	train-rmspe:0.150039	test-rmspe:0.27215
[58]	train-rmse:0.135711	test-rmse:0.140192	train-rmspe:0.149787	test-rmspe:0.272339
[59]	train-rmse:0.134792	test-rmse:0.139331	train-rmspe:0.149012	test-rmspe:0.272987
[60]	train-rmse:0.134235	test-rmse:0.138823	train-rmspe:0.148709	test-rmspe:0.273139
[61]	train-rmse:0.133616	test-rmse:0.138248	train-rmspe:0.148319	test-rmspe:0.273377
[62]	train-rmse:0.133182	test-rmse:0.137888	train-rmspe:0.147953	test-rmspe:0.273567
[63]	train-rmse:0.132641	test-rmse:0.137392	train-rmspe:0.147607	test-rmspe:0.273198
[64]	train-rmse:0.131965	test-rmse:0.136744	train-rmspe:0.147014	test-rmspe:0.275219
[65]	train-rmse:0.131514	test-rmse:0.136315	train-rmspe:0.146771	test-rmspe:0.275278
[66]	train-rmse:0.131099	test-rmse:0.135961	train-rmspe:0.146412	test-rmspe:0.275318
[67]	train-rmse:0.130578	test-rmse:0.135489	train-rmspe:0.145996	test-rmspe:0.275119
[68]	train-rmse:0.130117	test-rmse:0.135092	train-rmspe:0.145526	test-rmspe:0.274319
[69]	train-rmse:0.12971	test-rmse:0.134728	train-rmspe:0.145122	test-rmspe:0.274026
[70]	train-rmse:0.129181	test-rmse:0.134243	train-rmspe:0.144622	test-rmspe:0.273689
[71]	train-rmse:0.128842	test-rmse:0.133972	train-rmspe:0.144313	test-rmspe:0.273716
[72]	train-rmse:0.128554	test-rmse:0.13374	train-rmspe:0.144097	test-rmspe:0.273567
[73]	train-rmse:0.128269	test-rmse:0.133493	train-rmspe:0.143873	test-rmspe:0.273526
[74]	train-rmse:0.128083	test-rmse:0.133332	train-rmspe:0.143734	test-rmspe:0.273476
[75]	train-rmse:0.127696	test-rmse:0.132986	train-rmspe:0.143312	test-rmspe:0.273556
[76]	train-rmse:0.127506	test-rmse:0.132852	train-rmspe:0.14316	test-rmspe:0.273487
[77]	train-rmse:0.127271	test-rmse:0.13268	train-rmspe:0.142864	test-rmspe:0.273392
[78]	train-rmse:0.126221	test-rmse:0.1317	train-rmspe:0.141646	test-rmspe:0.272131
[79]	train-rmse:0.125864	test-rmse:0.131408	train-rmspe:0.141256	test-rmspe:0.27171
[80]	train-rmse:0.125527	test-rmse:0.131106	train-rmspe:0.140902	test-rmspe:0.273793
[81]	train-rmse:0.12533	test-rmse:0.130978	train-rmspe:0.140726	test-rmspe:0.273645
[82]	train-rmse:0.125151	test-rmse:0.130866	train-rmspe:0.140556	test-rmspe:0.27357
[83]	train-rmse:0.124859	test-rmse:0.130633	train-rmspe:0.140247	test-rmspe:0.273473
[84]	train-rmse:0.124705	test-rmse:0.130497	train-rmspe:0.140098	test-rmspe:0.27442
[85]	train-rmse:0.124537	test-rmse:0.130345	train-rmspe:0.139926	test-rmspe:0.274295
[86]	train-rmse:0.12391	test-rmse:0.129732	train-rmspe:0.139241	test-rmspe:0.273805
[87]	train-rmse:0.12362	test-rmse:0.129505	train-rmspe:0.138942	test-rmspe:0.273562
[88]	train-rmse:0.123405	test-rmse:0.129304	train-rmspe:0.138735	test-rmspe:0.273363
[89]	train-rmse:0.123058	test-rmse:0.12898	train-rmspe:0.138374	test-rmspe:0.273151
[90]	train-rmse:0.122361	test-rmse:0.128336	train-rmspe:0.137566	test-rmspe:0.272255
[91]	train-rmse:0.122067	test-rmse:0.12806	train-rmspe:0.137233	test-rmspe:0.271998
[92]	train-rmse:0.121776	test-rmse:0.127803	train-rmspe:0.136908	test-rmspe:0.273444
Stopping. Best iteration:
[42]	train-rmse:0.169565	test-rmse:0.172734	train-rmspe:0.167891	test-rmspe:0.271688

Let's try out Hyperparameter Tuning

In [251]:
# XGB with sklearn wrapper
# the same parameters as for xgboost model
params_sk = {'max_depth': 10, 
            'n_estimators': 300, # the same as num_rounds in xgboost
            'objective': 'reg:linear', 
            'subsample': 0.8, 
            'colsample_bytree': 0.85, 
            'learning_rate': 0.1, 
            'seed': 42}     

skrg = XGBRegressor(**params_sk)

skrg.fit(xl_train, yl_train)
Out[251]:
XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.85, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=10, min_child_weight=1, missing=None, n_estimators=300,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=42, silent=True,
       subsample=0.8)
In [253]:
import scipy.stats as st
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

params_grid = {  
    'learning_rate': st.uniform(0.01, 0.3),
    'max_depth': list(range(10, 20, 2)),
    'gamma': st.uniform(0, 10),
    'reg_alpha': st.expon(0, 50)}

search_sk = RandomizedSearchCV(skrg, params_grid, cv = 5) # 5 fold cross validation
search_sk.fit(xl_train, yl_train)

# best parameters
print(search_sk.best_params_); print(search_sk.best_score_)
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-253-e1193835fe00> in <module>()
      9 
     10 search_sk = RandomizedSearchCV(skrg, params_grid, cv = 5) # 5 fold cross validation
---> 11 search_sk.fit(xl_train, yl_train)
     12 
     13 # best parameters

~/anaconda3/envs/pallavi/lib/python3.6/site-packages/sklearn/model_selection/_search.py in fit(self, X, y, groups, **fit_params)
    637                                   error_score=self.error_score)
    638           for parameters, (train, test) in product(candidate_params,
--> 639                                                    cv.split(X, y, groups)))
    640 
    641         # if one choose to see train score, "out" will contain train score info

~/anaconda3/envs/pallavi/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self, iterable)
    777             # was dispatched. In particular this covers the edge
    778             # case of Parallel used with an exhausted iterator.
--> 779             while self.dispatch_one_batch(iterator):
    780                 self._iterating = True
    781             else:

~/anaconda3/envs/pallavi/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in dispatch_one_batch(self, iterator)
    623                 return False
    624             else:
--> 625                 self._dispatch(tasks)
    626                 return True
    627 

~/anaconda3/envs/pallavi/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in _dispatch(self, batch)
    586         dispatch_timestamp = time.time()
    587         cb = BatchCompletionCallBack(dispatch_timestamp, len(batch), self)
--> 588         job = self._backend.apply_async(batch, callback=cb)
    589         self._jobs.append(job)
    590 

~/anaconda3/envs/pallavi/lib/python3.6/site-packages/sklearn/externals/joblib/_parallel_backends.py in apply_async(self, func, callback)
    109     def apply_async(self, func, callback=None):
    110         """Schedule a func to be run"""
--> 111         result = ImmediateResult(func)
    112         if callback:
    113             callback(result)

~/anaconda3/envs/pallavi/lib/python3.6/site-packages/sklearn/externals/joblib/_parallel_backends.py in __init__(self, batch)
    330         # Don't delay the application, to avoid keeping the input
    331         # arguments in memory
--> 332         self.results = batch()
    333 
    334     def get(self):

~/anaconda3/envs/pallavi/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self)
    129 
    130     def __call__(self):
--> 131         return [func(*args, **kwargs) for func, args, kwargs in self.items]
    132 
    133     def __len__(self):

~/anaconda3/envs/pallavi/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in <listcomp>(.0)
    129 
    130     def __call__(self):
--> 131         return [func(*args, **kwargs) for func, args, kwargs in self.items]
    132 
    133     def __len__(self):

~/anaconda3/envs/pallavi/lib/python3.6/site-packages/sklearn/model_selection/_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, error_score)
    456             estimator.fit(X_train, **fit_params)
    457         else:
--> 458             estimator.fit(X_train, y_train, **fit_params)
    459 
    460     except Exception as e:

~/anaconda3/envs/pallavi/lib/python3.6/site-packages/xgboost/sklearn.py in fit(self, X, y, sample_weight, eval_set, eval_metric, early_stopping_rounds, verbose, xgb_model)
    291                               early_stopping_rounds=early_stopping_rounds,
    292                               evals_result=evals_result, obj=obj, feval=feval,
--> 293                               verbose_eval=verbose, xgb_model=xgb_model)
    294 
    295         if evals_result:

~/anaconda3/envs/pallavi/lib/python3.6/site-packages/xgboost/training.py in train(params, dtrain, num_boost_round, evals, obj, feval, maximize, early_stopping_rounds, evals_result, verbose_eval, xgb_model, callbacks, learning_rates)
    202                            evals=evals,
    203                            obj=obj, feval=feval,
--> 204                            xgb_model=xgb_model, callbacks=callbacks)
    205 
    206 

~/anaconda3/envs/pallavi/lib/python3.6/site-packages/xgboost/training.py in _train_internal(params, dtrain, num_boost_round, evals, obj, feval, xgb_model, callbacks)
     72         # Skip the first update if it is a recovery step.
     73         if version % 2 == 0:
---> 74             bst.update(dtrain, i, obj)
     75             bst.save_rabit_checkpoint()
     76             version += 1

~/anaconda3/envs/pallavi/lib/python3.6/site-packages/xgboost/core.py in update(self, dtrain, iteration, fobj)
    893         if fobj is None:
    894             _check_call(_LIB.XGBoosterUpdateOneIter(self.handle, ctypes.c_int(iteration),
--> 895                                                     dtrain.handle))
    896         else:
    897             pred = self.predict(dtrain)

KeyboardInterrupt: 
In [260]:
# with new parameters
params_new = {
    'booster': 'gbtree', 
    'objective': 'reg:linear', 
    'subsample': 0.8, 
    'colsample_bytree': 0.85, 
    'eta': 0.044338624448041611, 
    'max_depth': 16, 
    'gamma': 0.80198330585415034,
    'reg_alpha': 23.008226565535971,
    'seed': 42} 

model_final = xgb.train(params_new, dtrain, 300, evals = watchlist,
                        early_stopping_rounds = 50, feval = rmspe_xg, verbose_eval = True)
[0]	train-rmse:7.90223	test-rmse:7.90249	train-rmspe:0.999764	test-rmspe:0.999764
Multiple eval metrics have been passed: 'test-rmspe' will be used for early stopping.

Will train until test-rmspe hasn't improved in 50 rounds.
[1]	train-rmse:7.55233	test-rmse:7.55262	train-rmspe:0.999596	test-rmspe:0.999595
[2]	train-rmse:7.21794	test-rmse:7.21823	train-rmspe:0.99937	test-rmspe:0.99937
[3]	train-rmse:6.89827	test-rmse:6.89857	train-rmspe:0.999072	test-rmspe:0.999072
[4]	train-rmse:6.59301	test-rmse:6.59331	train-rmspe:0.998683	test-rmspe:0.998683
[5]	train-rmse:6.30106	test-rmse:6.30139	train-rmspe:0.998187	test-rmspe:0.998187
[6]	train-rmse:6.02208	test-rmse:6.02243	train-rmspe:0.997558	test-rmspe:0.997558
[7]	train-rmse:5.75559	test-rmse:5.75594	train-rmspe:0.996768	test-rmspe:0.996769
[8]	train-rmse:5.50092	test-rmse:5.50127	train-rmspe:0.995792	test-rmspe:0.995793
[9]	train-rmse:5.25746	test-rmse:5.25783	train-rmspe:0.994602	test-rmspe:0.994603
[10]	train-rmse:5.02491	test-rmse:5.02528	train-rmspe:0.993159	test-rmspe:0.99316
[11]	train-rmse:4.80269	test-rmse:4.80306	train-rmspe:0.991432	test-rmspe:0.991433
[12]	train-rmse:4.59023	test-rmse:4.59062	train-rmspe:0.989388	test-rmspe:0.989391
[13]	train-rmse:4.3875	test-rmse:4.38788	train-rmspe:0.986974	test-rmspe:0.986979
[14]	train-rmse:4.19349	test-rmse:4.19389	train-rmspe:0.984184	test-rmspe:0.984191
[15]	train-rmse:4.0082	test-rmse:4.00861	train-rmspe:0.980958	test-rmspe:0.980968
[16]	train-rmse:3.83112	test-rmse:3.83152	train-rmspe:0.977272	test-rmspe:0.977287
[17]	train-rmse:3.66183	test-rmse:3.66223	train-rmspe:0.973097	test-rmspe:0.973119
[18]	train-rmse:3.50015	test-rmse:3.50056	train-rmspe:0.96839	test-rmspe:0.968421
[19]	train-rmse:3.34558	test-rmse:3.34599	train-rmspe:0.963137	test-rmspe:0.96318
[20]	train-rmse:3.1979	test-rmse:3.19832	train-rmspe:0.957307	test-rmspe:0.957366
[21]	train-rmse:3.05679	test-rmse:3.05723	train-rmspe:0.95088	test-rmspe:0.950961
[22]	train-rmse:2.92201	test-rmse:2.92245	train-rmspe:0.943829	test-rmspe:0.943937
[23]	train-rmse:2.79319	test-rmse:2.79365	train-rmspe:0.93616	test-rmspe:0.936304
[24]	train-rmse:2.6701	test-rmse:2.67055	train-rmspe:0.927856	test-rmspe:0.928044
[25]	train-rmse:2.55254	test-rmse:2.553	train-rmspe:0.91891	test-rmspe:0.919153
[26]	train-rmse:2.4402	test-rmse:2.44067	train-rmspe:0.909339	test-rmspe:0.909649
[27]	train-rmse:2.33286	test-rmse:2.33334	train-rmspe:0.899143	test-rmspe:0.899532
[28]	train-rmse:2.23043	test-rmse:2.23091	train-rmspe:0.888316	test-rmspe:0.888796
[29]	train-rmse:2.13245	test-rmse:2.13294	train-rmspe:0.876916	test-rmspe:0.877511
[30]	train-rmse:2.03897	test-rmse:2.03946	train-rmspe:0.864909	test-rmspe:0.865635
[31]	train-rmse:1.94955	test-rmse:1.95003	train-rmspe:0.85239	test-rmspe:0.853271
[32]	train-rmse:1.86414	test-rmse:1.86463	train-rmspe:0.839345	test-rmspe:0.840407
[33]	train-rmse:1.78256	test-rmse:1.78306	train-rmspe:0.825816	test-rmspe:0.827092
[34]	train-rmse:1.70462	test-rmse:1.70513	train-rmspe:0.811835	test-rmspe:0.813352
[35]	train-rmse:1.63017	test-rmse:1.6307	train-rmspe:0.797445	test-rmspe:0.79925
[36]	train-rmse:1.5591	test-rmse:1.55963	train-rmspe:0.782661	test-rmspe:0.784783
[37]	train-rmse:1.49124	test-rmse:1.49178	train-rmspe:0.767545	test-rmspe:0.769971
[38]	train-rmse:1.42642	test-rmse:1.42696	train-rmspe:0.752138	test-rmspe:0.754959
[39]	train-rmse:1.36451	test-rmse:1.36506	train-rmspe:0.736487	test-rmspe:0.73975
[40]	train-rmse:1.30537	test-rmse:1.30593	train-rmspe:0.720637	test-rmspe:0.724326
[41]	train-rmse:1.24893	test-rmse:1.24949	train-rmspe:0.704606	test-rmspe:0.708826
[42]	train-rmse:1.19503	test-rmse:1.19559	train-rmspe:0.688452	test-rmspe:0.693227
[43]	train-rmse:1.14356	test-rmse:1.14413	train-rmspe:0.672211	test-rmspe:0.677513
[44]	train-rmse:1.09445	test-rmse:1.09502	train-rmspe:0.655942	test-rmspe:0.6619
[45]	train-rmse:1.04753	test-rmse:1.04811	train-rmspe:0.639687	test-rmspe:0.646411
[46]	train-rmse:1.00279	test-rmse:1.00338	train-rmspe:0.623462	test-rmspe:0.631007
[47]	train-rmse:0.960109	test-rmse:0.960704	train-rmspe:0.607305	test-rmspe:0.615753
[48]	train-rmse:0.919335	test-rmse:0.919932	train-rmspe:0.591279	test-rmspe:0.600686
[49]	train-rmse:0.880476	test-rmse:0.881077	train-rmspe:0.575391	test-rmspe:0.58583
[50]	train-rmse:0.843352	test-rmse:0.843946	train-rmspe:0.559608	test-rmspe:0.570859
[51]	train-rmse:0.807989	test-rmse:0.808588	train-rmspe:0.544099	test-rmspe:0.556517
[52]	train-rmse:0.774215	test-rmse:0.774828	train-rmspe:0.528792	test-rmspe:0.542369
[53]	train-rmse:0.742033	test-rmse:0.742656	train-rmspe:0.513743	test-rmspe:0.528583
[54]	train-rmse:0.71135	test-rmse:0.71197	train-rmspe:0.498923	test-rmspe:0.514762
[55]	train-rmse:0.682142	test-rmse:0.682773	train-rmspe:0.484461	test-rmspe:0.501763
[56]	train-rmse:0.654361	test-rmse:0.655	train-rmspe:0.470335	test-rmspe:0.489177
[57]	train-rmse:0.627827	test-rmse:0.628466	train-rmspe:0.45648	test-rmspe:0.476662
[58]	train-rmse:0.602492	test-rmse:0.603142	train-rmspe:0.443041	test-rmspe:0.464673
[59]	train-rmse:0.578388	test-rmse:0.579044	train-rmspe:0.429888	test-rmspe:0.45307
[60]	train-rmse:0.555472	test-rmse:0.556137	train-rmspe:0.417201	test-rmspe:0.442169
[61]	train-rmse:0.533663	test-rmse:0.534331	train-rmspe:0.404933	test-rmspe:0.431757
[62]	train-rmse:0.51301	test-rmse:0.513685	train-rmspe:0.393069	test-rmspe:0.421868
[63]	train-rmse:0.493317	test-rmse:0.494003	train-rmspe:0.38151	test-rmspe:0.412277
[64]	train-rmse:0.474554	test-rmse:0.475255	train-rmspe:0.370337	test-rmspe:0.403355
[65]	train-rmse:0.456875	test-rmse:0.457582	train-rmspe:0.359734	test-rmspe:0.394968
[66]	train-rmse:0.43993	test-rmse:0.440644	train-rmspe:0.3494	test-rmspe:0.386502
[67]	train-rmse:0.423856	test-rmse:0.424588	train-rmspe:0.339446	test-rmspe:0.378861
[68]	train-rmse:0.408539	test-rmse:0.409283	train-rmspe:0.329946	test-rmspe:0.371593
[69]	train-rmse:0.394066	test-rmse:0.394815	train-rmspe:0.320954	test-rmspe:0.364904
[70]	train-rmse:0.380367	test-rmse:0.381133	train-rmspe:0.312367	test-rmspe:0.358781
[71]	train-rmse:0.367447	test-rmse:0.368228	train-rmspe:0.304175	test-rmspe:0.353086
[72]	train-rmse:0.354973	test-rmse:0.355762	train-rmspe:0.296267	test-rmspe:0.346733
[73]	train-rmse:0.343292	test-rmse:0.344092	train-rmspe:0.288876	test-rmspe:0.341688
[74]	train-rmse:0.332279	test-rmse:0.333085	train-rmspe:0.281865	test-rmspe:0.337042
[75]	train-rmse:0.322007	test-rmse:0.322826	train-rmspe:0.275333	test-rmspe:0.332915
[76]	train-rmse:0.312165	test-rmse:0.313004	train-rmspe:0.269121	test-rmspe:0.329009
[77]	train-rmse:0.302892	test-rmse:0.303744	train-rmspe:0.263262	test-rmspe:0.325529
[78]	train-rmse:0.293974	test-rmse:0.294837	train-rmspe:0.257463	test-rmspe:0.321261
[79]	train-rmse:0.285751	test-rmse:0.286625	train-rmspe:0.252334	test-rmspe:0.318355
[80]	train-rmse:0.277894	test-rmse:0.278789	train-rmspe:0.24737	test-rmspe:0.315361
[81]	train-rmse:0.270461	test-rmse:0.27137	train-rmspe:0.242746	test-rmspe:0.31291
[82]	train-rmse:0.263576	test-rmse:0.264507	train-rmspe:0.238479	test-rmspe:0.310975
[83]	train-rmse:0.257172	test-rmse:0.258105	train-rmspe:0.234638	test-rmspe:0.30917
[84]	train-rmse:0.251029	test-rmse:0.251972	train-rmspe:0.230938	test-rmspe:0.306791
[85]	train-rmse:0.245073	test-rmse:0.246034	train-rmspe:0.226952	test-rmspe:0.304368
[86]	train-rmse:0.239692	test-rmse:0.240663	train-rmspe:0.223733	test-rmspe:0.303
[87]	train-rmse:0.23482	test-rmse:0.235796	train-rmspe:0.220966	test-rmspe:0.301872
[88]	train-rmse:0.230135	test-rmse:0.231125	train-rmspe:0.218251	test-rmspe:0.300834
[89]	train-rmse:0.225877	test-rmse:0.226876	train-rmspe:0.215927	test-rmspe:0.300182
[90]	train-rmse:0.221632	test-rmse:0.22264	train-rmspe:0.213555	test-rmspe:0.298771
[91]	train-rmse:0.217792	test-rmse:0.218805	train-rmspe:0.2114	test-rmspe:0.298176
[92]	train-rmse:0.214154	test-rmse:0.215177	train-rmspe:0.209386	test-rmspe:0.29763
[93]	train-rmse:0.21075	test-rmse:0.211773	train-rmspe:0.207544	test-rmspe:0.297213
[94]	train-rmse:0.207169	test-rmse:0.208213	train-rmspe:0.205193	test-rmspe:0.296076
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-260-b7617bce1a8a> in <module>()
     12 
     13 model_final = xgb.train(params_new, dtrain, 300, evals = watchlist,
---> 14                         early_stopping_rounds = 50, feval = rmspe_xg, verbose_eval = True)

~/anaconda3/envs/pallavi/lib/python3.6/site-packages/xgboost/training.py in train(params, dtrain, num_boost_round, evals, obj, feval, maximize, early_stopping_rounds, evals_result, verbose_eval, xgb_model, callbacks, learning_rates)
    202                            evals=evals,
    203                            obj=obj, feval=feval,
--> 204                            xgb_model=xgb_model, callbacks=callbacks)
    205 
    206 

~/anaconda3/envs/pallavi/lib/python3.6/site-packages/xgboost/training.py in _train_internal(params, dtrain, num_boost_round, evals, obj, feval, xgb_model, callbacks)
     72         # Skip the first update if it is a recovery step.
     73         if version % 2 == 0:
---> 74             bst.update(dtrain, i, obj)
     75             bst.save_rabit_checkpoint()
     76             version += 1

~/anaconda3/envs/pallavi/lib/python3.6/site-packages/xgboost/core.py in update(self, dtrain, iteration, fobj)
    893         if fobj is None:
    894             _check_call(_LIB.XGBoosterUpdateOneIter(self.handle, ctypes.c_int(iteration),
--> 895                                                     dtrain.handle))
    896         else:
    897             pred = self.predict(dtrain)

KeyboardInterrupt: 
In [263]:
xgb.plot_tree(xgb_model, num_trees=2)
fig = matplotlib.pyplot.gcf()
fig.set_size_inches(150, 100)
fig.savefig('tree.png')
In [264]:
xgb.plot_importance(xgb_model)
Out[264]:
<matplotlib.axes._subplots.AxesSubplot at 0x140dcddd8>